[llvm] [RISCV] Enable load clustering by default (PR #73789)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 6 06:20:35 PST 2023


https://github.com/asb updated https://github.com/llvm/llvm-project/pull/73789

From ecc1ca6822882fb4a4287ae1eaad1896e1d52ff5 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 10:38:57 +0000
Subject: [PATCH 1/6] [MachineScheduler][NFCI] Add Offset and OffsetIsScalable
 args to shouldClusterMemOps

These values are picked up from getMemOperandsWithOffsetWidth but weren't
being passed through to shouldClusterMemOps, which forces backends to
collect the information again if they want to use the kind of heuristic
typically used for the similar shouldScheduleLoadsNear function (e.g.
checking that the offsets are within one cache line).

This patch just adds the parameters, but doesn't attempt to use them.
There is an opportunity to use them in the current PPC and AArch64
shouldClusterMemOps implementations, and I intend to use the offset in
the heuristic for RISC-V. I've left these for future patches in the
interest of being as incremental as possible.
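
For illustration only (not part of this patch), a target override of the
extended hook could consume the new parameters along the lines of the
sketch below. MyTargetInstrInfo is a placeholder name, and the 64-byte
line size and cluster-size cap are assumptions made for the sketch
(loosely modelled on the RISC-V change later in this series), not
something any target currently does:

  bool MyTargetInstrInfo::shouldClusterMemOps(
      ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset1,
      bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
      int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize,
      unsigned NumBytes) const {
    // Scalable offsets can't be compared as plain byte distances.
    if (OffsetIsScalable1 || OffsetIsScalable2)
      return false;
    // Cluster only when the accesses are likely to share (or neighbour) a
    // cache line; 64 bytes is an assumed line size for this sketch.
    return std::abs(Offset1 - Offset2) < 64 && ClusterSize <= 4;
  }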
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h    |  2 ++
 llvm/lib/CodeGen/MachineScheduler.cpp          | 14 +++++++++-----
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp   |  5 +++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.h     |  2 ++
 llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp |  5 ++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp         |  2 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h           |  2 ++
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp       |  5 +++--
 llvm/lib/Target/PowerPC/PPCInstrInfo.h         |  2 ++
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp       |  5 +++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.h         |  2 ++
 11 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 665b7449ddb82..b7dc56f93c739 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1501,7 +1501,9 @@ class TargetInstrInfo : public MCInstrInfo {
   /// \p NumBytes is the number of bytes that will be loaded from all the
   /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                   int64_t Offset1, bool OffsetIsScalable1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
+                                   int64_t Offset2, bool OffsetIsScalable2,
                                    unsigned ClusterSize,
                                    unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4add33ba0996a..cd5fe71ef0c1a 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1698,11 +1698,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     unsigned Width;
+    bool OffsetIsScalable;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset, unsigned Width)
+              int64_t Offset, bool OffsetIsScalable, unsigned Width)
         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
-          Width(Width) {}
+          OffsetIsScalable(OffsetIsScalable), Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1831,8 +1832,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
 
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes))
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpa.Offset,
+                                  MemOpa.OffsetIsScalable, MemOpb.BaseOps,
+                                  MemOpb.Offset, MemOpb.OffsetIsScalable,
+                                  ClusterLength, CurrentClusterBytes))
       continue;
 
     SUnit *SUa = MemOpa.SU;
@@ -1899,7 +1902,8 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
     unsigned Width;
     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
                                            OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+      MemOpRecords.push_back(
+          MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width));
 
       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e97f17e3f49c5..6b49e17528ada 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4230,8 +4230,9 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
 ///
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index cc588cdad6b8e..65e5fb49536da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -179,7 +179,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
                            int64_t &MinOffset, int64_t &MaxOffset);
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 50f8ad4433c6d..442ae4dd7b34f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -232,7 +232,10 @@ class SIInsertHardClauses : public MachineFunctionPass {
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+              // We also lie about the Offset and OffsetIsScalable parameters,
+              // as they aren't used in the SIInstrInfo implementation.
+              !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
+                                        2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d691254..0a06fa88b6b10 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -541,7 +541,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                      int64_t Offset1, bool OffsetIsScalable1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
+                                      int64_t Offset2, bool OffsetIsScalable2,
                                       unsigned ClusterSize,
                                       unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index eba817756e9c5..6da4f74dfe5f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -234,7 +234,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       const TargetRegisterInfo *TRI) const final;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6784049348b16..0de795d9d5e81 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2878,8 +2878,9 @@ static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
 }
 
 bool PPCInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
 
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 31e9859a41739..0c9ad607418ec 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -530,7 +530,9 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   /// Returns true if the two given memory operations should be scheduled
   /// adjacent.
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2918e5654db4f..f596a4cd37528 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2266,8 +2266,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool RISCVInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0954286a419bd..7e1d3f3118065 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -158,7 +158,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
       const TargetRegisterInfo *TRI) const override;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 

From 5a1d0a9e7a893ae90d7b71cc4b819ef75487d23f Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 11:09:20 +0000
Subject: [PATCH 2/6] Add missing doc comment change

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index b7dc56f93c739..afff4e5894e85 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1496,6 +1496,9 @@ class TargetInstrInfo : public MCInstrInfo {
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
+  /// \p Offset1 and \p Offset2 are the byte offsets for the memory
+  /// operations, while \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate
+  /// if the offset is scaled gby a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store
   /// cluster if this hook returns true.
   /// \p NumBytes is the number of bytes that will be loaded from all the

From 2cada4e7e8754f556d7c8a9b2120a7f7747c981a Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 11:12:17 +0000
Subject: [PATCH 3/6] Note that Offset2 will never be less than Offset1

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index afff4e5894e85..ba7f70d66a6c0 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1497,8 +1497,10 @@ class TargetInstrInfo : public MCInstrInfo {
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p Offset1 and \p Offset2 are the byte offsets for the memory
-  /// operations, while \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate
-  /// if the offset is scaled gby a runtime quantity.
+  /// operations, and \p Offset2 is guaranteed to be greater than or equal to
+  /// Offset1.
+  /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
+  /// scaled by a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store
   /// cluster if this hook returns true.
   /// \p NumBytes is the number of bytes that will be loaded from all the

From 3b552e80d6bd43736abba4de388271628b9c74af Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 12:12:39 +0000
Subject: [PATCH 4/6] Actually Offset2 may be less than Offset1

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index ba7f70d66a6c0..4ec46f9bde1ad 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1497,8 +1497,7 @@ class TargetInstrInfo : public MCInstrInfo {
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p Offset1 and \p Offset2 are the byte offsets for the memory
-  /// operations, and \p Offset2 is guaranteed to be greater than or equal to
-  /// Offset1.
+  /// operations.
   /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
   /// scaled by a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store

From ceb5e024cde3d8879a59aa030d8cf98fe464f890 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 12:24:27 +0000
Subject: [PATCH 5/6] [RISCV] Enable load clustering by default

Also tweaks the heuristic to cluster if the operations are within a cache
line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does
something similar, but checks `((Offset2 - Offset1) / 8 > 64)`; I'm not
sure whether that threshold is intentionally 512 bytes or whether the
division is in error.
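
For illustration only, and not code taken from either backend, the two
thresholds compare roughly as in the sketch below, with 64 bytes as the
assumed cache line size:

  #include <cstdint>
  #include <cstdlib>

  // Proposed RISC-V check: cluster while the two ops are within one
  // (assumed 64-byte) cache line of each other.
  bool withinCacheLine(int64_t Offset1, int64_t Offset2) {
    return std::abs(Offset1 - Offset2) < 64;
  }

  // X86-style expression quoted above: because of the division by 8, it
  // only triggers once the distance exceeds roughly 8 * 64 = 512 bytes.
  bool exceedsX86StyleThreshold(int64_t Offset1, int64_t Offset2) {
    return (Offset2 - Offset1) / 8 > 64;
  }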

Posting for comment and so that people can test this on their workloads;
feedback and ideas for a tweaked heuristic are welcome.

Stacks on top of #73778.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    6 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   16 +-
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   26 +-
 .../test/CodeGen/RISCV/callee-saved-fpr32s.ll |  720 ++--
 .../test/CodeGen/RISCV/callee-saved-fpr64s.ll |  496 +--
 llvm/test/CodeGen/RISCV/callee-saved-gprs.ll  | 1044 ++---
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |   88 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |   44 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |  104 +-
 llvm/test/CodeGen/RISCV/idiv_large.ll         |    6 +-
 .../test/CodeGen/RISCV/intrinsic-cttz-elts.ll |   16 +-
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |    8 +-
 llvm/test/CodeGen/RISCV/llvm.exp10.ll         |   26 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |   17 +-
 llvm/test/CodeGen/RISCV/mul.ll                |   60 +-
 llvm/test/CodeGen/RISCV/nontemporal.ll        | 1420 +++---
 llvm/test/CodeGen/RISCV/push-pop-popret.ll    | 2004 ++++-----
 .../test/CodeGen/RISCV/reduction-formation.ll |   72 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |   44 +-
 llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll    |   44 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   34 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   22 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |   28 +-
 .../fixed-vectors-strided-load-store-asm.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  264 +-
 llvm/test/CodeGen/RISCV/rvv/pr63596.ll        |   12 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  500 +--
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  182 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  214 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |   98 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |   57 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  166 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 2138 +++++-----
 .../RISCV/wide-scalar-shift-legalization.ll   | 3794 +++++++++--------
 llvm/test/CodeGen/RISCV/xtheadmempair.ll      |   14 +-
 36 files changed, 6915 insertions(+), 6889 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index f596a4cd37528..cf399f37d255c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2282,9 +2282,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  // A cache line is typically 64 bytes, so cluster if the memory ops are on
+  // the same or a neighbouring cache line.
+  return std::abs(Offset1 - Offset2) < 64;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b6c194f03f542..0954fbb8314c6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt<bool>
                         cl::desc("Enable Split RegisterAlloc for RVV"),
                         cl::init(false));
 
-static cl::opt<bool> EnableMISchedLoadClustering(
-    "riscv-misched-load-clustering", cl::Hidden,
-    cl::desc("Enable load clustering in the machine scheduler"),
-    cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -350,15 +345,10 @@ class RISCVPassConfig : public TargetPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
-    ScheduleDAGMILive *DAG = nullptr;
-    if (EnableMISchedLoadClustering) {
-      DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-    }
-    if (ST.hasMacroFusion()) {
-      DAG = DAG ? DAG : createGenericSchedLive(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
-    }
     return DAG;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa9..3695a8a7f6086 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -200,25 +200,25 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    lw a6, 4(a1)
+; RV32C-NEXT:    lw a6, 8(a1)
 ; RV32C-NEXT:    c.lw a3, 12(a1)
-; RV32C-NEXT:    c.lw a4, 0(a1)
-; RV32C-NEXT:    c.lw a1, 8(a1)
+; RV32C-NEXT:    c.lw a2, 4(a1)
+; RV32C-NEXT:    c.lw a1, 0(a1)
 ; RV32C-NEXT:    c.lui a5, 16
 ; RV32C-NEXT:    c.add a3, a5
 ; RV32C-NEXT:    c.slli a3, 3
-; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a3, a5
-; RV32C-NEXT:    srli a5, a4, 29
-; RV32C-NEXT:    slli a2, a6, 3
-; RV32C-NEXT:    c.or a2, a5
 ; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    c.or a3, a5
+; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    slli a4, a2, 3
+; RV32C-NEXT:    c.or a4, a5
+; RV32C-NEXT:    c.srli a2, 29
+; RV32C-NEXT:    c.slli a6, 3
+; RV32C-NEXT:    or a2, a6, a2
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a5
-; RV32C-NEXT:    c.slli a4, 3
-; RV32C-NEXT:    c.sw a4, 0(a0)
-; RV32C-NEXT:    c.sw a1, 8(a0)
-; RV32C-NEXT:    c.sw a2, 4(a0)
+; RV32C-NEXT:    c.sw a1, 0(a0)
+; RV32C-NEXT:    c.sw a2, 8(a0)
+; RV32C-NEXT:    c.sw a4, 4(a0)
 ; RV32C-NEXT:    c.sw a3, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 7111316931f19..7aeaaab68a208 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -25,69 +25,69 @@ define void @callee() nounwind {
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
 ; ILP32-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    flw fa1, 16(a1)
-; ILP32-NEXT:    flw fa0, 20(a1)
-; ILP32-NEXT:    flw ft0, 24(a1)
-; ILP32-NEXT:    flw ft1, 28(a1)
-; ILP32-NEXT:    flw ft2, 32(a1)
-; ILP32-NEXT:    flw ft3, 36(a1)
-; ILP32-NEXT:    flw ft4, 40(a1)
-; ILP32-NEXT:    flw ft5, 44(a1)
-; ILP32-NEXT:    flw ft6, 48(a1)
-; ILP32-NEXT:    flw ft7, 52(a1)
-; ILP32-NEXT:    flw fa6, 56(a1)
-; ILP32-NEXT:    flw fa7, 60(a1)
-; ILP32-NEXT:    flw ft8, 64(a1)
-; ILP32-NEXT:    flw ft9, 68(a1)
-; ILP32-NEXT:    flw ft10, 72(a1)
-; ILP32-NEXT:    flw ft11, 76(a1)
-; ILP32-NEXT:    flw fs0, 80(a1)
-; ILP32-NEXT:    flw fs1, 84(a1)
-; ILP32-NEXT:    flw fs2, 88(a1)
-; ILP32-NEXT:    flw fs3, 92(a1)
-; ILP32-NEXT:    flw fs4, 96(a1)
-; ILP32-NEXT:    flw fs5, 100(a1)
-; ILP32-NEXT:    flw fs6, 104(a1)
-; ILP32-NEXT:    flw fs7, 108(a1)
+; ILP32-NEXT:    flw fa4, 16(a1)
+; ILP32-NEXT:    flw fa3, 20(a1)
+; ILP32-NEXT:    flw fa2, 24(a1)
+; ILP32-NEXT:    flw fa1, 28(a1)
+; ILP32-NEXT:    flw fa0, 32(a1)
+; ILP32-NEXT:    flw ft0, 36(a1)
+; ILP32-NEXT:    flw ft1, 40(a1)
+; ILP32-NEXT:    flw ft2, 44(a1)
+; ILP32-NEXT:    flw ft3, 48(a1)
+; ILP32-NEXT:    flw ft4, 52(a1)
+; ILP32-NEXT:    flw ft5, 56(a1)
+; ILP32-NEXT:    flw ft6, 60(a1)
+; ILP32-NEXT:    flw ft7, 64(a1)
+; ILP32-NEXT:    flw fa6, 68(a1)
+; ILP32-NEXT:    flw fa7, 72(a1)
+; ILP32-NEXT:    flw ft8, 76(a1)
+; ILP32-NEXT:    flw ft9, 80(a1)
+; ILP32-NEXT:    flw ft10, 84(a1)
+; ILP32-NEXT:    flw ft11, 88(a1)
+; ILP32-NEXT:    flw fs0, 92(a1)
+; ILP32-NEXT:    flw fs1, 96(a1)
+; ILP32-NEXT:    flw fs2, 100(a1)
+; ILP32-NEXT:    flw fs3, 104(a1)
+; ILP32-NEXT:    flw fs4, 108(a1)
+; ILP32-NEXT:    flw fs5, 112(a1)
+; ILP32-NEXT:    flw fs6, 116(a1)
+; ILP32-NEXT:    flw fs7, 120(a1)
 ; ILP32-NEXT:    flw fs8, 124(a1)
-; ILP32-NEXT:    flw fs9, 120(a1)
-; ILP32-NEXT:    flw fs10, 116(a1)
-; ILP32-NEXT:    flw fs11, 112(a1)
+; ILP32-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32-NEXT:    fsw fs8, 124(a1)
-; ILP32-NEXT:    fsw fs9, 120(a1)
-; ILP32-NEXT:    fsw fs10, 116(a1)
-; ILP32-NEXT:    fsw fs11, 112(a1)
-; ILP32-NEXT:    fsw fs7, 108(a1)
-; ILP32-NEXT:    fsw fs6, 104(a1)
-; ILP32-NEXT:    fsw fs5, 100(a1)
-; ILP32-NEXT:    fsw fs4, 96(a1)
-; ILP32-NEXT:    fsw fs3, 92(a1)
-; ILP32-NEXT:    fsw fs2, 88(a1)
-; ILP32-NEXT:    fsw fs1, 84(a1)
-; ILP32-NEXT:    fsw fs0, 80(a1)
-; ILP32-NEXT:    fsw ft11, 76(a1)
-; ILP32-NEXT:    fsw ft10, 72(a1)
-; ILP32-NEXT:    fsw ft9, 68(a1)
-; ILP32-NEXT:    fsw ft8, 64(a1)
-; ILP32-NEXT:    fsw fa7, 60(a1)
-; ILP32-NEXT:    fsw fa6, 56(a1)
-; ILP32-NEXT:    fsw ft7, 52(a1)
-; ILP32-NEXT:    fsw ft6, 48(a1)
-; ILP32-NEXT:    fsw ft5, 44(a1)
-; ILP32-NEXT:    fsw ft4, 40(a1)
-; ILP32-NEXT:    fsw ft3, 36(a1)
-; ILP32-NEXT:    fsw ft2, 32(a1)
-; ILP32-NEXT:    fsw ft1, 28(a1)
-; ILP32-NEXT:    fsw ft0, 24(a1)
-; ILP32-NEXT:    fsw fa0, 20(a1)
-; ILP32-NEXT:    fsw fa1, 16(a1)
-; ILP32-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32-NEXT:    fsw fs7, 120(a1)
+; ILP32-NEXT:    fsw fs6, 116(a1)
+; ILP32-NEXT:    fsw fs5, 112(a1)
+; ILP32-NEXT:    fsw fs4, 108(a1)
+; ILP32-NEXT:    fsw fs3, 104(a1)
+; ILP32-NEXT:    fsw fs2, 100(a1)
+; ILP32-NEXT:    fsw fs1, 96(a1)
+; ILP32-NEXT:    fsw fs0, 92(a1)
+; ILP32-NEXT:    fsw ft11, 88(a1)
+; ILP32-NEXT:    fsw ft10, 84(a1)
+; ILP32-NEXT:    fsw ft9, 80(a1)
+; ILP32-NEXT:    fsw ft8, 76(a1)
+; ILP32-NEXT:    fsw fa7, 72(a1)
+; ILP32-NEXT:    fsw fa6, 68(a1)
+; ILP32-NEXT:    fsw ft7, 64(a1)
+; ILP32-NEXT:    fsw ft6, 60(a1)
+; ILP32-NEXT:    fsw ft5, 56(a1)
+; ILP32-NEXT:    fsw ft4, 52(a1)
+; ILP32-NEXT:    fsw ft3, 48(a1)
+; ILP32-NEXT:    fsw ft2, 44(a1)
+; ILP32-NEXT:    fsw ft1, 40(a1)
+; ILP32-NEXT:    fsw ft0, 36(a1)
+; ILP32-NEXT:    fsw fa0, 32(a1)
+; ILP32-NEXT:    fsw fa1, 28(a1)
+; ILP32-NEXT:    fsw fa2, 24(a1)
+; ILP32-NEXT:    fsw fa3, 20(a1)
+; ILP32-NEXT:    fsw fa4, 16(a1)
+; ILP32-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
@@ -95,69 +95,69 @@ define void @callee() nounwind {
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
 ; LP64-NEXT:    flw fa5, %lo(var)(a0)
-; LP64-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    flw fa1, 16(a1)
-; LP64-NEXT:    flw fa0, 20(a1)
-; LP64-NEXT:    flw ft0, 24(a1)
-; LP64-NEXT:    flw ft1, 28(a1)
-; LP64-NEXT:    flw ft2, 32(a1)
-; LP64-NEXT:    flw ft3, 36(a1)
-; LP64-NEXT:    flw ft4, 40(a1)
-; LP64-NEXT:    flw ft5, 44(a1)
-; LP64-NEXT:    flw ft6, 48(a1)
-; LP64-NEXT:    flw ft7, 52(a1)
-; LP64-NEXT:    flw fa6, 56(a1)
-; LP64-NEXT:    flw fa7, 60(a1)
-; LP64-NEXT:    flw ft8, 64(a1)
-; LP64-NEXT:    flw ft9, 68(a1)
-; LP64-NEXT:    flw ft10, 72(a1)
-; LP64-NEXT:    flw ft11, 76(a1)
-; LP64-NEXT:    flw fs0, 80(a1)
-; LP64-NEXT:    flw fs1, 84(a1)
-; LP64-NEXT:    flw fs2, 88(a1)
-; LP64-NEXT:    flw fs3, 92(a1)
-; LP64-NEXT:    flw fs4, 96(a1)
-; LP64-NEXT:    flw fs5, 100(a1)
-; LP64-NEXT:    flw fs6, 104(a1)
-; LP64-NEXT:    flw fs7, 108(a1)
+; LP64-NEXT:    flw fa4, 16(a1)
+; LP64-NEXT:    flw fa3, 20(a1)
+; LP64-NEXT:    flw fa2, 24(a1)
+; LP64-NEXT:    flw fa1, 28(a1)
+; LP64-NEXT:    flw fa0, 32(a1)
+; LP64-NEXT:    flw ft0, 36(a1)
+; LP64-NEXT:    flw ft1, 40(a1)
+; LP64-NEXT:    flw ft2, 44(a1)
+; LP64-NEXT:    flw ft3, 48(a1)
+; LP64-NEXT:    flw ft4, 52(a1)
+; LP64-NEXT:    flw ft5, 56(a1)
+; LP64-NEXT:    flw ft6, 60(a1)
+; LP64-NEXT:    flw ft7, 64(a1)
+; LP64-NEXT:    flw fa6, 68(a1)
+; LP64-NEXT:    flw fa7, 72(a1)
+; LP64-NEXT:    flw ft8, 76(a1)
+; LP64-NEXT:    flw ft9, 80(a1)
+; LP64-NEXT:    flw ft10, 84(a1)
+; LP64-NEXT:    flw ft11, 88(a1)
+; LP64-NEXT:    flw fs0, 92(a1)
+; LP64-NEXT:    flw fs1, 96(a1)
+; LP64-NEXT:    flw fs2, 100(a1)
+; LP64-NEXT:    flw fs3, 104(a1)
+; LP64-NEXT:    flw fs4, 108(a1)
+; LP64-NEXT:    flw fs5, 112(a1)
+; LP64-NEXT:    flw fs6, 116(a1)
+; LP64-NEXT:    flw fs7, 120(a1)
 ; LP64-NEXT:    flw fs8, 124(a1)
-; LP64-NEXT:    flw fs9, 120(a1)
-; LP64-NEXT:    flw fs10, 116(a1)
-; LP64-NEXT:    flw fs11, 112(a1)
+; LP64-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64-NEXT:    fsw fs8, 124(a1)
-; LP64-NEXT:    fsw fs9, 120(a1)
-; LP64-NEXT:    fsw fs10, 116(a1)
-; LP64-NEXT:    fsw fs11, 112(a1)
-; LP64-NEXT:    fsw fs7, 108(a1)
-; LP64-NEXT:    fsw fs6, 104(a1)
-; LP64-NEXT:    fsw fs5, 100(a1)
-; LP64-NEXT:    fsw fs4, 96(a1)
-; LP64-NEXT:    fsw fs3, 92(a1)
-; LP64-NEXT:    fsw fs2, 88(a1)
-; LP64-NEXT:    fsw fs1, 84(a1)
-; LP64-NEXT:    fsw fs0, 80(a1)
-; LP64-NEXT:    fsw ft11, 76(a1)
-; LP64-NEXT:    fsw ft10, 72(a1)
-; LP64-NEXT:    fsw ft9, 68(a1)
-; LP64-NEXT:    fsw ft8, 64(a1)
-; LP64-NEXT:    fsw fa7, 60(a1)
-; LP64-NEXT:    fsw fa6, 56(a1)
-; LP64-NEXT:    fsw ft7, 52(a1)
-; LP64-NEXT:    fsw ft6, 48(a1)
-; LP64-NEXT:    fsw ft5, 44(a1)
-; LP64-NEXT:    fsw ft4, 40(a1)
-; LP64-NEXT:    fsw ft3, 36(a1)
-; LP64-NEXT:    fsw ft2, 32(a1)
-; LP64-NEXT:    fsw ft1, 28(a1)
-; LP64-NEXT:    fsw ft0, 24(a1)
-; LP64-NEXT:    fsw fa0, 20(a1)
-; LP64-NEXT:    fsw fa1, 16(a1)
-; LP64-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64-NEXT:    fsw fs7, 120(a1)
+; LP64-NEXT:    fsw fs6, 116(a1)
+; LP64-NEXT:    fsw fs5, 112(a1)
+; LP64-NEXT:    fsw fs4, 108(a1)
+; LP64-NEXT:    fsw fs3, 104(a1)
+; LP64-NEXT:    fsw fs2, 100(a1)
+; LP64-NEXT:    fsw fs1, 96(a1)
+; LP64-NEXT:    fsw fs0, 92(a1)
+; LP64-NEXT:    fsw ft11, 88(a1)
+; LP64-NEXT:    fsw ft10, 84(a1)
+; LP64-NEXT:    fsw ft9, 80(a1)
+; LP64-NEXT:    fsw ft8, 76(a1)
+; LP64-NEXT:    fsw fa7, 72(a1)
+; LP64-NEXT:    fsw fa6, 68(a1)
+; LP64-NEXT:    fsw ft7, 64(a1)
+; LP64-NEXT:    fsw ft6, 60(a1)
+; LP64-NEXT:    fsw ft5, 56(a1)
+; LP64-NEXT:    fsw ft4, 52(a1)
+; LP64-NEXT:    fsw ft3, 48(a1)
+; LP64-NEXT:    fsw ft2, 44(a1)
+; LP64-NEXT:    fsw ft1, 40(a1)
+; LP64-NEXT:    fsw ft0, 36(a1)
+; LP64-NEXT:    fsw fa0, 32(a1)
+; LP64-NEXT:    fsw fa1, 28(a1)
+; LP64-NEXT:    fsw fa2, 24(a1)
+; LP64-NEXT:    fsw fa3, 20(a1)
+; LP64-NEXT:    fsw fa4, 16(a1)
+; LP64-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
@@ -178,69 +178,69 @@ define void @callee() nounwind {
 ; ILP32F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; ILP32F-NEXT:    lui a0, %hi(var)
 ; ILP32F-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32F-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32F-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32F-NEXT:    addi a1, a0, %lo(var)
-; ILP32F-NEXT:    flw fa1, 16(a1)
-; ILP32F-NEXT:    flw fa0, 20(a1)
-; ILP32F-NEXT:    flw ft0, 24(a1)
-; ILP32F-NEXT:    flw ft1, 28(a1)
-; ILP32F-NEXT:    flw ft2, 32(a1)
-; ILP32F-NEXT:    flw ft3, 36(a1)
-; ILP32F-NEXT:    flw ft4, 40(a1)
-; ILP32F-NEXT:    flw ft5, 44(a1)
-; ILP32F-NEXT:    flw ft6, 48(a1)
-; ILP32F-NEXT:    flw ft7, 52(a1)
-; ILP32F-NEXT:    flw fa6, 56(a1)
-; ILP32F-NEXT:    flw fa7, 60(a1)
-; ILP32F-NEXT:    flw ft8, 64(a1)
-; ILP32F-NEXT:    flw ft9, 68(a1)
-; ILP32F-NEXT:    flw ft10, 72(a1)
-; ILP32F-NEXT:    flw ft11, 76(a1)
-; ILP32F-NEXT:    flw fs0, 80(a1)
-; ILP32F-NEXT:    flw fs1, 84(a1)
-; ILP32F-NEXT:    flw fs2, 88(a1)
-; ILP32F-NEXT:    flw fs3, 92(a1)
-; ILP32F-NEXT:    flw fs4, 96(a1)
-; ILP32F-NEXT:    flw fs5, 100(a1)
-; ILP32F-NEXT:    flw fs6, 104(a1)
-; ILP32F-NEXT:    flw fs7, 108(a1)
+; ILP32F-NEXT:    flw fa4, 16(a1)
+; ILP32F-NEXT:    flw fa3, 20(a1)
+; ILP32F-NEXT:    flw fa2, 24(a1)
+; ILP32F-NEXT:    flw fa1, 28(a1)
+; ILP32F-NEXT:    flw fa0, 32(a1)
+; ILP32F-NEXT:    flw ft0, 36(a1)
+; ILP32F-NEXT:    flw ft1, 40(a1)
+; ILP32F-NEXT:    flw ft2, 44(a1)
+; ILP32F-NEXT:    flw ft3, 48(a1)
+; ILP32F-NEXT:    flw ft4, 52(a1)
+; ILP32F-NEXT:    flw ft5, 56(a1)
+; ILP32F-NEXT:    flw ft6, 60(a1)
+; ILP32F-NEXT:    flw ft7, 64(a1)
+; ILP32F-NEXT:    flw fa6, 68(a1)
+; ILP32F-NEXT:    flw fa7, 72(a1)
+; ILP32F-NEXT:    flw ft8, 76(a1)
+; ILP32F-NEXT:    flw ft9, 80(a1)
+; ILP32F-NEXT:    flw ft10, 84(a1)
+; ILP32F-NEXT:    flw ft11, 88(a1)
+; ILP32F-NEXT:    flw fs0, 92(a1)
+; ILP32F-NEXT:    flw fs1, 96(a1)
+; ILP32F-NEXT:    flw fs2, 100(a1)
+; ILP32F-NEXT:    flw fs3, 104(a1)
+; ILP32F-NEXT:    flw fs4, 108(a1)
+; ILP32F-NEXT:    flw fs5, 112(a1)
+; ILP32F-NEXT:    flw fs6, 116(a1)
+; ILP32F-NEXT:    flw fs7, 120(a1)
 ; ILP32F-NEXT:    flw fs8, 124(a1)
-; ILP32F-NEXT:    flw fs9, 120(a1)
-; ILP32F-NEXT:    flw fs10, 116(a1)
-; ILP32F-NEXT:    flw fs11, 112(a1)
+; ILP32F-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32F-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32F-NEXT:    fsw fs8, 124(a1)
-; ILP32F-NEXT:    fsw fs9, 120(a1)
-; ILP32F-NEXT:    fsw fs10, 116(a1)
-; ILP32F-NEXT:    fsw fs11, 112(a1)
-; ILP32F-NEXT:    fsw fs7, 108(a1)
-; ILP32F-NEXT:    fsw fs6, 104(a1)
-; ILP32F-NEXT:    fsw fs5, 100(a1)
-; ILP32F-NEXT:    fsw fs4, 96(a1)
-; ILP32F-NEXT:    fsw fs3, 92(a1)
-; ILP32F-NEXT:    fsw fs2, 88(a1)
-; ILP32F-NEXT:    fsw fs1, 84(a1)
-; ILP32F-NEXT:    fsw fs0, 80(a1)
-; ILP32F-NEXT:    fsw ft11, 76(a1)
-; ILP32F-NEXT:    fsw ft10, 72(a1)
-; ILP32F-NEXT:    fsw ft9, 68(a1)
-; ILP32F-NEXT:    fsw ft8, 64(a1)
-; ILP32F-NEXT:    fsw fa7, 60(a1)
-; ILP32F-NEXT:    fsw fa6, 56(a1)
-; ILP32F-NEXT:    fsw ft7, 52(a1)
-; ILP32F-NEXT:    fsw ft6, 48(a1)
-; ILP32F-NEXT:    fsw ft5, 44(a1)
-; ILP32F-NEXT:    fsw ft4, 40(a1)
-; ILP32F-NEXT:    fsw ft3, 36(a1)
-; ILP32F-NEXT:    fsw ft2, 32(a1)
-; ILP32F-NEXT:    fsw ft1, 28(a1)
-; ILP32F-NEXT:    fsw ft0, 24(a1)
-; ILP32F-NEXT:    fsw fa0, 20(a1)
-; ILP32F-NEXT:    fsw fa1, 16(a1)
-; ILP32F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32F-NEXT:    fsw fs7, 120(a1)
+; ILP32F-NEXT:    fsw fs6, 116(a1)
+; ILP32F-NEXT:    fsw fs5, 112(a1)
+; ILP32F-NEXT:    fsw fs4, 108(a1)
+; ILP32F-NEXT:    fsw fs3, 104(a1)
+; ILP32F-NEXT:    fsw fs2, 100(a1)
+; ILP32F-NEXT:    fsw fs1, 96(a1)
+; ILP32F-NEXT:    fsw fs0, 92(a1)
+; ILP32F-NEXT:    fsw ft11, 88(a1)
+; ILP32F-NEXT:    fsw ft10, 84(a1)
+; ILP32F-NEXT:    fsw ft9, 80(a1)
+; ILP32F-NEXT:    fsw ft8, 76(a1)
+; ILP32F-NEXT:    fsw fa7, 72(a1)
+; ILP32F-NEXT:    fsw fa6, 68(a1)
+; ILP32F-NEXT:    fsw ft7, 64(a1)
+; ILP32F-NEXT:    fsw ft6, 60(a1)
+; ILP32F-NEXT:    fsw ft5, 56(a1)
+; ILP32F-NEXT:    fsw ft4, 52(a1)
+; ILP32F-NEXT:    fsw ft3, 48(a1)
+; ILP32F-NEXT:    fsw ft2, 44(a1)
+; ILP32F-NEXT:    fsw ft1, 40(a1)
+; ILP32F-NEXT:    fsw ft0, 36(a1)
+; ILP32F-NEXT:    fsw fa0, 32(a1)
+; ILP32F-NEXT:    fsw fa1, 28(a1)
+; ILP32F-NEXT:    fsw fa2, 24(a1)
+; ILP32F-NEXT:    fsw fa3, 20(a1)
+; ILP32F-NEXT:    fsw fa4, 16(a1)
+; ILP32F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32F-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; ILP32F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -274,69 +274,69 @@ define void @callee() nounwind {
 ; LP64F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; LP64F-NEXT:    lui a0, %hi(var)
 ; LP64F-NEXT:    flw fa5, %lo(var)(a0)
-; LP64F-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64F-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64F-NEXT:    addi a1, a0, %lo(var)
-; LP64F-NEXT:    flw fa1, 16(a1)
-; LP64F-NEXT:    flw fa0, 20(a1)
-; LP64F-NEXT:    flw ft0, 24(a1)
-; LP64F-NEXT:    flw ft1, 28(a1)
-; LP64F-NEXT:    flw ft2, 32(a1)
-; LP64F-NEXT:    flw ft3, 36(a1)
-; LP64F-NEXT:    flw ft4, 40(a1)
-; LP64F-NEXT:    flw ft5, 44(a1)
-; LP64F-NEXT:    flw ft6, 48(a1)
-; LP64F-NEXT:    flw ft7, 52(a1)
-; LP64F-NEXT:    flw fa6, 56(a1)
-; LP64F-NEXT:    flw fa7, 60(a1)
-; LP64F-NEXT:    flw ft8, 64(a1)
-; LP64F-NEXT:    flw ft9, 68(a1)
-; LP64F-NEXT:    flw ft10, 72(a1)
-; LP64F-NEXT:    flw ft11, 76(a1)
-; LP64F-NEXT:    flw fs0, 80(a1)
-; LP64F-NEXT:    flw fs1, 84(a1)
-; LP64F-NEXT:    flw fs2, 88(a1)
-; LP64F-NEXT:    flw fs3, 92(a1)
-; LP64F-NEXT:    flw fs4, 96(a1)
-; LP64F-NEXT:    flw fs5, 100(a1)
-; LP64F-NEXT:    flw fs6, 104(a1)
-; LP64F-NEXT:    flw fs7, 108(a1)
+; LP64F-NEXT:    flw fa4, 16(a1)
+; LP64F-NEXT:    flw fa3, 20(a1)
+; LP64F-NEXT:    flw fa2, 24(a1)
+; LP64F-NEXT:    flw fa1, 28(a1)
+; LP64F-NEXT:    flw fa0, 32(a1)
+; LP64F-NEXT:    flw ft0, 36(a1)
+; LP64F-NEXT:    flw ft1, 40(a1)
+; LP64F-NEXT:    flw ft2, 44(a1)
+; LP64F-NEXT:    flw ft3, 48(a1)
+; LP64F-NEXT:    flw ft4, 52(a1)
+; LP64F-NEXT:    flw ft5, 56(a1)
+; LP64F-NEXT:    flw ft6, 60(a1)
+; LP64F-NEXT:    flw ft7, 64(a1)
+; LP64F-NEXT:    flw fa6, 68(a1)
+; LP64F-NEXT:    flw fa7, 72(a1)
+; LP64F-NEXT:    flw ft8, 76(a1)
+; LP64F-NEXT:    flw ft9, 80(a1)
+; LP64F-NEXT:    flw ft10, 84(a1)
+; LP64F-NEXT:    flw ft11, 88(a1)
+; LP64F-NEXT:    flw fs0, 92(a1)
+; LP64F-NEXT:    flw fs1, 96(a1)
+; LP64F-NEXT:    flw fs2, 100(a1)
+; LP64F-NEXT:    flw fs3, 104(a1)
+; LP64F-NEXT:    flw fs4, 108(a1)
+; LP64F-NEXT:    flw fs5, 112(a1)
+; LP64F-NEXT:    flw fs6, 116(a1)
+; LP64F-NEXT:    flw fs7, 120(a1)
 ; LP64F-NEXT:    flw fs8, 124(a1)
-; LP64F-NEXT:    flw fs9, 120(a1)
-; LP64F-NEXT:    flw fs10, 116(a1)
-; LP64F-NEXT:    flw fs11, 112(a1)
+; LP64F-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64F-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64F-NEXT:    fsw fs8, 124(a1)
-; LP64F-NEXT:    fsw fs9, 120(a1)
-; LP64F-NEXT:    fsw fs10, 116(a1)
-; LP64F-NEXT:    fsw fs11, 112(a1)
-; LP64F-NEXT:    fsw fs7, 108(a1)
-; LP64F-NEXT:    fsw fs6, 104(a1)
-; LP64F-NEXT:    fsw fs5, 100(a1)
-; LP64F-NEXT:    fsw fs4, 96(a1)
-; LP64F-NEXT:    fsw fs3, 92(a1)
-; LP64F-NEXT:    fsw fs2, 88(a1)
-; LP64F-NEXT:    fsw fs1, 84(a1)
-; LP64F-NEXT:    fsw fs0, 80(a1)
-; LP64F-NEXT:    fsw ft11, 76(a1)
-; LP64F-NEXT:    fsw ft10, 72(a1)
-; LP64F-NEXT:    fsw ft9, 68(a1)
-; LP64F-NEXT:    fsw ft8, 64(a1)
-; LP64F-NEXT:    fsw fa7, 60(a1)
-; LP64F-NEXT:    fsw fa6, 56(a1)
-; LP64F-NEXT:    fsw ft7, 52(a1)
-; LP64F-NEXT:    fsw ft6, 48(a1)
-; LP64F-NEXT:    fsw ft5, 44(a1)
-; LP64F-NEXT:    fsw ft4, 40(a1)
-; LP64F-NEXT:    fsw ft3, 36(a1)
-; LP64F-NEXT:    fsw ft2, 32(a1)
-; LP64F-NEXT:    fsw ft1, 28(a1)
-; LP64F-NEXT:    fsw ft0, 24(a1)
-; LP64F-NEXT:    fsw fa0, 20(a1)
-; LP64F-NEXT:    fsw fa1, 16(a1)
-; LP64F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64F-NEXT:    fsw fs7, 120(a1)
+; LP64F-NEXT:    fsw fs6, 116(a1)
+; LP64F-NEXT:    fsw fs5, 112(a1)
+; LP64F-NEXT:    fsw fs4, 108(a1)
+; LP64F-NEXT:    fsw fs3, 104(a1)
+; LP64F-NEXT:    fsw fs2, 100(a1)
+; LP64F-NEXT:    fsw fs1, 96(a1)
+; LP64F-NEXT:    fsw fs0, 92(a1)
+; LP64F-NEXT:    fsw ft11, 88(a1)
+; LP64F-NEXT:    fsw ft10, 84(a1)
+; LP64F-NEXT:    fsw ft9, 80(a1)
+; LP64F-NEXT:    fsw ft8, 76(a1)
+; LP64F-NEXT:    fsw fa7, 72(a1)
+; LP64F-NEXT:    fsw fa6, 68(a1)
+; LP64F-NEXT:    fsw ft7, 64(a1)
+; LP64F-NEXT:    fsw ft6, 60(a1)
+; LP64F-NEXT:    fsw ft5, 56(a1)
+; LP64F-NEXT:    fsw ft4, 52(a1)
+; LP64F-NEXT:    fsw ft3, 48(a1)
+; LP64F-NEXT:    fsw ft2, 44(a1)
+; LP64F-NEXT:    fsw ft1, 40(a1)
+; LP64F-NEXT:    fsw ft0, 36(a1)
+; LP64F-NEXT:    fsw fa0, 32(a1)
+; LP64F-NEXT:    fsw fa1, 28(a1)
+; LP64F-NEXT:    fsw fa2, 24(a1)
+; LP64F-NEXT:    fsw fa3, 20(a1)
+; LP64F-NEXT:    fsw fa4, 16(a1)
+; LP64F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64F-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; LP64F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -370,69 +370,69 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
 ; ILP32D-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32D-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32D-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    flw fa1, 16(a1)
-; ILP32D-NEXT:    flw fa0, 20(a1)
-; ILP32D-NEXT:    flw ft0, 24(a1)
-; ILP32D-NEXT:    flw ft1, 28(a1)
-; ILP32D-NEXT:    flw ft2, 32(a1)
-; ILP32D-NEXT:    flw ft3, 36(a1)
-; ILP32D-NEXT:    flw ft4, 40(a1)
-; ILP32D-NEXT:    flw ft5, 44(a1)
-; ILP32D-NEXT:    flw ft6, 48(a1)
-; ILP32D-NEXT:    flw ft7, 52(a1)
-; ILP32D-NEXT:    flw fa6, 56(a1)
-; ILP32D-NEXT:    flw fa7, 60(a1)
-; ILP32D-NEXT:    flw ft8, 64(a1)
-; ILP32D-NEXT:    flw ft9, 68(a1)
-; ILP32D-NEXT:    flw ft10, 72(a1)
-; ILP32D-NEXT:    flw ft11, 76(a1)
-; ILP32D-NEXT:    flw fs0, 80(a1)
-; ILP32D-NEXT:    flw fs1, 84(a1)
-; ILP32D-NEXT:    flw fs2, 88(a1)
-; ILP32D-NEXT:    flw fs3, 92(a1)
-; ILP32D-NEXT:    flw fs4, 96(a1)
-; ILP32D-NEXT:    flw fs5, 100(a1)
-; ILP32D-NEXT:    flw fs6, 104(a1)
-; ILP32D-NEXT:    flw fs7, 108(a1)
+; ILP32D-NEXT:    flw fa4, 16(a1)
+; ILP32D-NEXT:    flw fa3, 20(a1)
+; ILP32D-NEXT:    flw fa2, 24(a1)
+; ILP32D-NEXT:    flw fa1, 28(a1)
+; ILP32D-NEXT:    flw fa0, 32(a1)
+; ILP32D-NEXT:    flw ft0, 36(a1)
+; ILP32D-NEXT:    flw ft1, 40(a1)
+; ILP32D-NEXT:    flw ft2, 44(a1)
+; ILP32D-NEXT:    flw ft3, 48(a1)
+; ILP32D-NEXT:    flw ft4, 52(a1)
+; ILP32D-NEXT:    flw ft5, 56(a1)
+; ILP32D-NEXT:    flw ft6, 60(a1)
+; ILP32D-NEXT:    flw ft7, 64(a1)
+; ILP32D-NEXT:    flw fa6, 68(a1)
+; ILP32D-NEXT:    flw fa7, 72(a1)
+; ILP32D-NEXT:    flw ft8, 76(a1)
+; ILP32D-NEXT:    flw ft9, 80(a1)
+; ILP32D-NEXT:    flw ft10, 84(a1)
+; ILP32D-NEXT:    flw ft11, 88(a1)
+; ILP32D-NEXT:    flw fs0, 92(a1)
+; ILP32D-NEXT:    flw fs1, 96(a1)
+; ILP32D-NEXT:    flw fs2, 100(a1)
+; ILP32D-NEXT:    flw fs3, 104(a1)
+; ILP32D-NEXT:    flw fs4, 108(a1)
+; ILP32D-NEXT:    flw fs5, 112(a1)
+; ILP32D-NEXT:    flw fs6, 116(a1)
+; ILP32D-NEXT:    flw fs7, 120(a1)
 ; ILP32D-NEXT:    flw fs8, 124(a1)
-; ILP32D-NEXT:    flw fs9, 120(a1)
-; ILP32D-NEXT:    flw fs10, 116(a1)
-; ILP32D-NEXT:    flw fs11, 112(a1)
+; ILP32D-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32D-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32D-NEXT:    fsw fs8, 124(a1)
-; ILP32D-NEXT:    fsw fs9, 120(a1)
-; ILP32D-NEXT:    fsw fs10, 116(a1)
-; ILP32D-NEXT:    fsw fs11, 112(a1)
-; ILP32D-NEXT:    fsw fs7, 108(a1)
-; ILP32D-NEXT:    fsw fs6, 104(a1)
-; ILP32D-NEXT:    fsw fs5, 100(a1)
-; ILP32D-NEXT:    fsw fs4, 96(a1)
-; ILP32D-NEXT:    fsw fs3, 92(a1)
-; ILP32D-NEXT:    fsw fs2, 88(a1)
-; ILP32D-NEXT:    fsw fs1, 84(a1)
-; ILP32D-NEXT:    fsw fs0, 80(a1)
-; ILP32D-NEXT:    fsw ft11, 76(a1)
-; ILP32D-NEXT:    fsw ft10, 72(a1)
-; ILP32D-NEXT:    fsw ft9, 68(a1)
-; ILP32D-NEXT:    fsw ft8, 64(a1)
-; ILP32D-NEXT:    fsw fa7, 60(a1)
-; ILP32D-NEXT:    fsw fa6, 56(a1)
-; ILP32D-NEXT:    fsw ft7, 52(a1)
-; ILP32D-NEXT:    fsw ft6, 48(a1)
-; ILP32D-NEXT:    fsw ft5, 44(a1)
-; ILP32D-NEXT:    fsw ft4, 40(a1)
-; ILP32D-NEXT:    fsw ft3, 36(a1)
-; ILP32D-NEXT:    fsw ft2, 32(a1)
-; ILP32D-NEXT:    fsw ft1, 28(a1)
-; ILP32D-NEXT:    fsw ft0, 24(a1)
-; ILP32D-NEXT:    fsw fa0, 20(a1)
-; ILP32D-NEXT:    fsw fa1, 16(a1)
-; ILP32D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32D-NEXT:    fsw fs7, 120(a1)
+; ILP32D-NEXT:    fsw fs6, 116(a1)
+; ILP32D-NEXT:    fsw fs5, 112(a1)
+; ILP32D-NEXT:    fsw fs4, 108(a1)
+; ILP32D-NEXT:    fsw fs3, 104(a1)
+; ILP32D-NEXT:    fsw fs2, 100(a1)
+; ILP32D-NEXT:    fsw fs1, 96(a1)
+; ILP32D-NEXT:    fsw fs0, 92(a1)
+; ILP32D-NEXT:    fsw ft11, 88(a1)
+; ILP32D-NEXT:    fsw ft10, 84(a1)
+; ILP32D-NEXT:    fsw ft9, 80(a1)
+; ILP32D-NEXT:    fsw ft8, 76(a1)
+; ILP32D-NEXT:    fsw fa7, 72(a1)
+; ILP32D-NEXT:    fsw fa6, 68(a1)
+; ILP32D-NEXT:    fsw ft7, 64(a1)
+; ILP32D-NEXT:    fsw ft6, 60(a1)
+; ILP32D-NEXT:    fsw ft5, 56(a1)
+; ILP32D-NEXT:    fsw ft4, 52(a1)
+; ILP32D-NEXT:    fsw ft3, 48(a1)
+; ILP32D-NEXT:    fsw ft2, 44(a1)
+; ILP32D-NEXT:    fsw ft1, 40(a1)
+; ILP32D-NEXT:    fsw ft0, 36(a1)
+; ILP32D-NEXT:    fsw fa0, 32(a1)
+; ILP32D-NEXT:    fsw fa1, 28(a1)
+; ILP32D-NEXT:    fsw fa2, 24(a1)
+; ILP32D-NEXT:    fsw fa3, 20(a1)
+; ILP32D-NEXT:    fsw fa4, 16(a1)
+; ILP32D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32D-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
@@ -466,69 +466,69 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
 ; LP64D-NEXT:    flw fa5, %lo(var)(a0)
-; LP64D-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64D-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    flw fa1, 16(a1)
-; LP64D-NEXT:    flw fa0, 20(a1)
-; LP64D-NEXT:    flw ft0, 24(a1)
-; LP64D-NEXT:    flw ft1, 28(a1)
-; LP64D-NEXT:    flw ft2, 32(a1)
-; LP64D-NEXT:    flw ft3, 36(a1)
-; LP64D-NEXT:    flw ft4, 40(a1)
-; LP64D-NEXT:    flw ft5, 44(a1)
-; LP64D-NEXT:    flw ft6, 48(a1)
-; LP64D-NEXT:    flw ft7, 52(a1)
-; LP64D-NEXT:    flw fa6, 56(a1)
-; LP64D-NEXT:    flw fa7, 60(a1)
-; LP64D-NEXT:    flw ft8, 64(a1)
-; LP64D-NEXT:    flw ft9, 68(a1)
-; LP64D-NEXT:    flw ft10, 72(a1)
-; LP64D-NEXT:    flw ft11, 76(a1)
-; LP64D-NEXT:    flw fs0, 80(a1)
-; LP64D-NEXT:    flw fs1, 84(a1)
-; LP64D-NEXT:    flw fs2, 88(a1)
-; LP64D-NEXT:    flw fs3, 92(a1)
-; LP64D-NEXT:    flw fs4, 96(a1)
-; LP64D-NEXT:    flw fs5, 100(a1)
-; LP64D-NEXT:    flw fs6, 104(a1)
-; LP64D-NEXT:    flw fs7, 108(a1)
+; LP64D-NEXT:    flw fa4, 16(a1)
+; LP64D-NEXT:    flw fa3, 20(a1)
+; LP64D-NEXT:    flw fa2, 24(a1)
+; LP64D-NEXT:    flw fa1, 28(a1)
+; LP64D-NEXT:    flw fa0, 32(a1)
+; LP64D-NEXT:    flw ft0, 36(a1)
+; LP64D-NEXT:    flw ft1, 40(a1)
+; LP64D-NEXT:    flw ft2, 44(a1)
+; LP64D-NEXT:    flw ft3, 48(a1)
+; LP64D-NEXT:    flw ft4, 52(a1)
+; LP64D-NEXT:    flw ft5, 56(a1)
+; LP64D-NEXT:    flw ft6, 60(a1)
+; LP64D-NEXT:    flw ft7, 64(a1)
+; LP64D-NEXT:    flw fa6, 68(a1)
+; LP64D-NEXT:    flw fa7, 72(a1)
+; LP64D-NEXT:    flw ft8, 76(a1)
+; LP64D-NEXT:    flw ft9, 80(a1)
+; LP64D-NEXT:    flw ft10, 84(a1)
+; LP64D-NEXT:    flw ft11, 88(a1)
+; LP64D-NEXT:    flw fs0, 92(a1)
+; LP64D-NEXT:    flw fs1, 96(a1)
+; LP64D-NEXT:    flw fs2, 100(a1)
+; LP64D-NEXT:    flw fs3, 104(a1)
+; LP64D-NEXT:    flw fs4, 108(a1)
+; LP64D-NEXT:    flw fs5, 112(a1)
+; LP64D-NEXT:    flw fs6, 116(a1)
+; LP64D-NEXT:    flw fs7, 120(a1)
 ; LP64D-NEXT:    flw fs8, 124(a1)
-; LP64D-NEXT:    flw fs9, 120(a1)
-; LP64D-NEXT:    flw fs10, 116(a1)
-; LP64D-NEXT:    flw fs11, 112(a1)
+; LP64D-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64D-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64D-NEXT:    fsw fs8, 124(a1)
-; LP64D-NEXT:    fsw fs9, 120(a1)
-; LP64D-NEXT:    fsw fs10, 116(a1)
-; LP64D-NEXT:    fsw fs11, 112(a1)
-; LP64D-NEXT:    fsw fs7, 108(a1)
-; LP64D-NEXT:    fsw fs6, 104(a1)
-; LP64D-NEXT:    fsw fs5, 100(a1)
-; LP64D-NEXT:    fsw fs4, 96(a1)
-; LP64D-NEXT:    fsw fs3, 92(a1)
-; LP64D-NEXT:    fsw fs2, 88(a1)
-; LP64D-NEXT:    fsw fs1, 84(a1)
-; LP64D-NEXT:    fsw fs0, 80(a1)
-; LP64D-NEXT:    fsw ft11, 76(a1)
-; LP64D-NEXT:    fsw ft10, 72(a1)
-; LP64D-NEXT:    fsw ft9, 68(a1)
-; LP64D-NEXT:    fsw ft8, 64(a1)
-; LP64D-NEXT:    fsw fa7, 60(a1)
-; LP64D-NEXT:    fsw fa6, 56(a1)
-; LP64D-NEXT:    fsw ft7, 52(a1)
-; LP64D-NEXT:    fsw ft6, 48(a1)
-; LP64D-NEXT:    fsw ft5, 44(a1)
-; LP64D-NEXT:    fsw ft4, 40(a1)
-; LP64D-NEXT:    fsw ft3, 36(a1)
-; LP64D-NEXT:    fsw ft2, 32(a1)
-; LP64D-NEXT:    fsw ft1, 28(a1)
-; LP64D-NEXT:    fsw ft0, 24(a1)
-; LP64D-NEXT:    fsw fa0, 20(a1)
-; LP64D-NEXT:    fsw fa1, 16(a1)
-; LP64D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64D-NEXT:    fsw fs7, 120(a1)
+; LP64D-NEXT:    fsw fs6, 116(a1)
+; LP64D-NEXT:    fsw fs5, 112(a1)
+; LP64D-NEXT:    fsw fs4, 108(a1)
+; LP64D-NEXT:    fsw fs3, 104(a1)
+; LP64D-NEXT:    fsw fs2, 100(a1)
+; LP64D-NEXT:    fsw fs1, 96(a1)
+; LP64D-NEXT:    fsw fs0, 92(a1)
+; LP64D-NEXT:    fsw ft11, 88(a1)
+; LP64D-NEXT:    fsw ft10, 84(a1)
+; LP64D-NEXT:    fsw ft9, 80(a1)
+; LP64D-NEXT:    fsw ft8, 76(a1)
+; LP64D-NEXT:    fsw fa7, 72(a1)
+; LP64D-NEXT:    fsw fa6, 68(a1)
+; LP64D-NEXT:    fsw ft7, 64(a1)
+; LP64D-NEXT:    fsw ft6, 60(a1)
+; LP64D-NEXT:    fsw ft5, 56(a1)
+; LP64D-NEXT:    fsw ft4, 52(a1)
+; LP64D-NEXT:    fsw ft3, 48(a1)
+; LP64D-NEXT:    fsw ft2, 44(a1)
+; LP64D-NEXT:    fsw ft1, 40(a1)
+; LP64D-NEXT:    fsw ft0, 36(a1)
+; LP64D-NEXT:    fsw fa0, 32(a1)
+; LP64D-NEXT:    fsw fa1, 28(a1)
+; LP64D-NEXT:    fsw fa2, 24(a1)
+; LP64D-NEXT:    fsw fa3, 20(a1)
+; LP64D-NEXT:    fsw fa4, 16(a1)
+; LP64D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64D-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
index 40076316bca89..a7f582f5f0699 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
@@ -20,141 +20,141 @@ define void @callee() nounwind {
 ; ILP32-LABEL: callee:
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
-; ILP32-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    fld fa3, 16(a1)
-; ILP32-NEXT:    fld fa2, 24(a1)
-; ILP32-NEXT:    fld fa1, 32(a1)
-; ILP32-NEXT:    fld fa0, 40(a1)
-; ILP32-NEXT:    fld ft0, 48(a1)
-; ILP32-NEXT:    fld ft1, 56(a1)
-; ILP32-NEXT:    fld ft2, 64(a1)
-; ILP32-NEXT:    fld ft3, 72(a1)
-; ILP32-NEXT:    fld ft4, 80(a1)
-; ILP32-NEXT:    fld ft5, 88(a1)
-; ILP32-NEXT:    fld ft6, 96(a1)
-; ILP32-NEXT:    fld ft7, 104(a1)
-; ILP32-NEXT:    fld fa6, 112(a1)
-; ILP32-NEXT:    fld fa7, 120(a1)
-; ILP32-NEXT:    fld ft8, 128(a1)
-; ILP32-NEXT:    fld ft9, 136(a1)
-; ILP32-NEXT:    fld ft10, 144(a1)
-; ILP32-NEXT:    fld ft11, 152(a1)
-; ILP32-NEXT:    fld fs0, 160(a1)
-; ILP32-NEXT:    fld fs1, 168(a1)
-; ILP32-NEXT:    fld fs2, 176(a1)
-; ILP32-NEXT:    fld fs3, 184(a1)
-; ILP32-NEXT:    fld fs4, 192(a1)
-; ILP32-NEXT:    fld fs5, 200(a1)
-; ILP32-NEXT:    fld fs6, 208(a1)
-; ILP32-NEXT:    fld fs7, 216(a1)
-; ILP32-NEXT:    fld fs8, 248(a1)
+; ILP32-NEXT:    fld fa5, 248(a1)
+; ILP32-NEXT:    fld fa4, 16(a1)
+; ILP32-NEXT:    fld fa3, 24(a1)
+; ILP32-NEXT:    fld fa2, 32(a1)
+; ILP32-NEXT:    fld fa1, 40(a1)
+; ILP32-NEXT:    fld fa0, 48(a1)
+; ILP32-NEXT:    fld ft0, 56(a1)
+; ILP32-NEXT:    fld ft1, 64(a1)
+; ILP32-NEXT:    fld ft2, 72(a1)
+; ILP32-NEXT:    fld ft3, 80(a1)
+; ILP32-NEXT:    fld ft4, 88(a1)
+; ILP32-NEXT:    fld ft5, 96(a1)
+; ILP32-NEXT:    fld ft6, 104(a1)
+; ILP32-NEXT:    fld ft7, 112(a1)
+; ILP32-NEXT:    fld fa6, 120(a1)
+; ILP32-NEXT:    fld fa7, 128(a1)
+; ILP32-NEXT:    fld ft8, 136(a1)
+; ILP32-NEXT:    fld ft9, 144(a1)
+; ILP32-NEXT:    fld ft10, 152(a1)
+; ILP32-NEXT:    fld ft11, 160(a1)
+; ILP32-NEXT:    fld fs0, 168(a1)
+; ILP32-NEXT:    fld fs1, 176(a1)
+; ILP32-NEXT:    fld fs2, 184(a1)
+; ILP32-NEXT:    fld fs3, 192(a1)
+; ILP32-NEXT:    fld fs4, 200(a1)
+; ILP32-NEXT:    fld fs5, 208(a1)
+; ILP32-NEXT:    fld fs6, 216(a1)
+; ILP32-NEXT:    fld fs7, 224(a1)
+; ILP32-NEXT:    fld fs8, 232(a1)
 ; ILP32-NEXT:    fld fs9, 240(a1)
-; ILP32-NEXT:    fld fs10, 232(a1)
-; ILP32-NEXT:    fld fs11, 224(a1)
-; ILP32-NEXT:    fsd fs8, 248(a1)
+; ILP32-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fa5, 248(a1)
 ; ILP32-NEXT:    fsd fs9, 240(a1)
-; ILP32-NEXT:    fsd fs10, 232(a1)
-; ILP32-NEXT:    fsd fs11, 224(a1)
-; ILP32-NEXT:    fsd fs7, 216(a1)
-; ILP32-NEXT:    fsd fs6, 208(a1)
-; ILP32-NEXT:    fsd fs5, 200(a1)
-; ILP32-NEXT:    fsd fs4, 192(a1)
-; ILP32-NEXT:    fsd fs3, 184(a1)
-; ILP32-NEXT:    fsd fs2, 176(a1)
-; ILP32-NEXT:    fsd fs1, 168(a1)
-; ILP32-NEXT:    fsd fs0, 160(a1)
-; ILP32-NEXT:    fsd ft11, 152(a1)
-; ILP32-NEXT:    fsd ft10, 144(a1)
-; ILP32-NEXT:    fsd ft9, 136(a1)
-; ILP32-NEXT:    fsd ft8, 128(a1)
-; ILP32-NEXT:    fsd fa7, 120(a1)
-; ILP32-NEXT:    fsd fa6, 112(a1)
-; ILP32-NEXT:    fsd ft7, 104(a1)
-; ILP32-NEXT:    fsd ft6, 96(a1)
-; ILP32-NEXT:    fsd ft5, 88(a1)
-; ILP32-NEXT:    fsd ft4, 80(a1)
-; ILP32-NEXT:    fsd ft3, 72(a1)
-; ILP32-NEXT:    fsd ft2, 64(a1)
-; ILP32-NEXT:    fsd ft1, 56(a1)
-; ILP32-NEXT:    fsd ft0, 48(a1)
-; ILP32-NEXT:    fsd fa0, 40(a1)
-; ILP32-NEXT:    fsd fa1, 32(a1)
-; ILP32-NEXT:    fsd fa2, 24(a1)
-; ILP32-NEXT:    fsd fa3, 16(a1)
-; ILP32-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32-NEXT:    fsd fs8, 232(a1)
+; ILP32-NEXT:    fsd fs7, 224(a1)
+; ILP32-NEXT:    fsd fs6, 216(a1)
+; ILP32-NEXT:    fsd fs5, 208(a1)
+; ILP32-NEXT:    fsd fs4, 200(a1)
+; ILP32-NEXT:    fsd fs3, 192(a1)
+; ILP32-NEXT:    fsd fs2, 184(a1)
+; ILP32-NEXT:    fsd fs1, 176(a1)
+; ILP32-NEXT:    fsd fs0, 168(a1)
+; ILP32-NEXT:    fsd ft11, 160(a1)
+; ILP32-NEXT:    fsd ft10, 152(a1)
+; ILP32-NEXT:    fsd ft9, 144(a1)
+; ILP32-NEXT:    fsd ft8, 136(a1)
+; ILP32-NEXT:    fsd fa7, 128(a1)
+; ILP32-NEXT:    fsd fa6, 120(a1)
+; ILP32-NEXT:    fsd ft7, 112(a1)
+; ILP32-NEXT:    fsd ft6, 104(a1)
+; ILP32-NEXT:    fsd ft5, 96(a1)
+; ILP32-NEXT:    fsd ft4, 88(a1)
+; ILP32-NEXT:    fsd ft3, 80(a1)
+; ILP32-NEXT:    fsd ft2, 72(a1)
+; ILP32-NEXT:    fsd ft1, 64(a1)
+; ILP32-NEXT:    fsd ft0, 56(a1)
+; ILP32-NEXT:    fsd fa0, 48(a1)
+; ILP32-NEXT:    fsd fa1, 40(a1)
+; ILP32-NEXT:    fsd fa2, 32(a1)
+; ILP32-NEXT:    fsd fa3, 24(a1)
+; ILP32-NEXT:    fsd fa4, 16(a1)
+; ILP32-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
 ; LP64-LABEL: callee:
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
-; LP64-NEXT:    fld fa5, %lo(var)(a0)
-; LP64-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    fld fa3, 16(a1)
-; LP64-NEXT:    fld fa2, 24(a1)
-; LP64-NEXT:    fld fa1, 32(a1)
-; LP64-NEXT:    fld fa0, 40(a1)
-; LP64-NEXT:    fld ft0, 48(a1)
-; LP64-NEXT:    fld ft1, 56(a1)
-; LP64-NEXT:    fld ft2, 64(a1)
-; LP64-NEXT:    fld ft3, 72(a1)
-; LP64-NEXT:    fld ft4, 80(a1)
-; LP64-NEXT:    fld ft5, 88(a1)
-; LP64-NEXT:    fld ft6, 96(a1)
-; LP64-NEXT:    fld ft7, 104(a1)
-; LP64-NEXT:    fld fa6, 112(a1)
-; LP64-NEXT:    fld fa7, 120(a1)
-; LP64-NEXT:    fld ft8, 128(a1)
-; LP64-NEXT:    fld ft9, 136(a1)
-; LP64-NEXT:    fld ft10, 144(a1)
-; LP64-NEXT:    fld ft11, 152(a1)
-; LP64-NEXT:    fld fs0, 160(a1)
-; LP64-NEXT:    fld fs1, 168(a1)
-; LP64-NEXT:    fld fs2, 176(a1)
-; LP64-NEXT:    fld fs3, 184(a1)
-; LP64-NEXT:    fld fs4, 192(a1)
-; LP64-NEXT:    fld fs5, 200(a1)
-; LP64-NEXT:    fld fs6, 208(a1)
-; LP64-NEXT:    fld fs7, 216(a1)
-; LP64-NEXT:    fld fs8, 248(a1)
+; LP64-NEXT:    fld fa5, 248(a1)
+; LP64-NEXT:    fld fa4, 16(a1)
+; LP64-NEXT:    fld fa3, 24(a1)
+; LP64-NEXT:    fld fa2, 32(a1)
+; LP64-NEXT:    fld fa1, 40(a1)
+; LP64-NEXT:    fld fa0, 48(a1)
+; LP64-NEXT:    fld ft0, 56(a1)
+; LP64-NEXT:    fld ft1, 64(a1)
+; LP64-NEXT:    fld ft2, 72(a1)
+; LP64-NEXT:    fld ft3, 80(a1)
+; LP64-NEXT:    fld ft4, 88(a1)
+; LP64-NEXT:    fld ft5, 96(a1)
+; LP64-NEXT:    fld ft6, 104(a1)
+; LP64-NEXT:    fld ft7, 112(a1)
+; LP64-NEXT:    fld fa6, 120(a1)
+; LP64-NEXT:    fld fa7, 128(a1)
+; LP64-NEXT:    fld ft8, 136(a1)
+; LP64-NEXT:    fld ft9, 144(a1)
+; LP64-NEXT:    fld ft10, 152(a1)
+; LP64-NEXT:    fld ft11, 160(a1)
+; LP64-NEXT:    fld fs0, 168(a1)
+; LP64-NEXT:    fld fs1, 176(a1)
+; LP64-NEXT:    fld fs2, 184(a1)
+; LP64-NEXT:    fld fs3, 192(a1)
+; LP64-NEXT:    fld fs4, 200(a1)
+; LP64-NEXT:    fld fs5, 208(a1)
+; LP64-NEXT:    fld fs6, 216(a1)
+; LP64-NEXT:    fld fs7, 224(a1)
+; LP64-NEXT:    fld fs8, 232(a1)
 ; LP64-NEXT:    fld fs9, 240(a1)
-; LP64-NEXT:    fld fs10, 232(a1)
-; LP64-NEXT:    fld fs11, 224(a1)
-; LP64-NEXT:    fsd fs8, 248(a1)
+; LP64-NEXT:    fld fs10, %lo(var)(a0)
+; LP64-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fa5, 248(a1)
 ; LP64-NEXT:    fsd fs9, 240(a1)
-; LP64-NEXT:    fsd fs10, 232(a1)
-; LP64-NEXT:    fsd fs11, 224(a1)
-; LP64-NEXT:    fsd fs7, 216(a1)
-; LP64-NEXT:    fsd fs6, 208(a1)
-; LP64-NEXT:    fsd fs5, 200(a1)
-; LP64-NEXT:    fsd fs4, 192(a1)
-; LP64-NEXT:    fsd fs3, 184(a1)
-; LP64-NEXT:    fsd fs2, 176(a1)
-; LP64-NEXT:    fsd fs1, 168(a1)
-; LP64-NEXT:    fsd fs0, 160(a1)
-; LP64-NEXT:    fsd ft11, 152(a1)
-; LP64-NEXT:    fsd ft10, 144(a1)
-; LP64-NEXT:    fsd ft9, 136(a1)
-; LP64-NEXT:    fsd ft8, 128(a1)
-; LP64-NEXT:    fsd fa7, 120(a1)
-; LP64-NEXT:    fsd fa6, 112(a1)
-; LP64-NEXT:    fsd ft7, 104(a1)
-; LP64-NEXT:    fsd ft6, 96(a1)
-; LP64-NEXT:    fsd ft5, 88(a1)
-; LP64-NEXT:    fsd ft4, 80(a1)
-; LP64-NEXT:    fsd ft3, 72(a1)
-; LP64-NEXT:    fsd ft2, 64(a1)
-; LP64-NEXT:    fsd ft1, 56(a1)
-; LP64-NEXT:    fsd ft0, 48(a1)
-; LP64-NEXT:    fsd fa0, 40(a1)
-; LP64-NEXT:    fsd fa1, 32(a1)
-; LP64-NEXT:    fsd fa2, 24(a1)
-; LP64-NEXT:    fsd fa3, 16(a1)
-; LP64-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64-NEXT:    fsd fs8, 232(a1)
+; LP64-NEXT:    fsd fs7, 224(a1)
+; LP64-NEXT:    fsd fs6, 216(a1)
+; LP64-NEXT:    fsd fs5, 208(a1)
+; LP64-NEXT:    fsd fs4, 200(a1)
+; LP64-NEXT:    fsd fs3, 192(a1)
+; LP64-NEXT:    fsd fs2, 184(a1)
+; LP64-NEXT:    fsd fs1, 176(a1)
+; LP64-NEXT:    fsd fs0, 168(a1)
+; LP64-NEXT:    fsd ft11, 160(a1)
+; LP64-NEXT:    fsd ft10, 152(a1)
+; LP64-NEXT:    fsd ft9, 144(a1)
+; LP64-NEXT:    fsd ft8, 136(a1)
+; LP64-NEXT:    fsd fa7, 128(a1)
+; LP64-NEXT:    fsd fa6, 120(a1)
+; LP64-NEXT:    fsd ft7, 112(a1)
+; LP64-NEXT:    fsd ft6, 104(a1)
+; LP64-NEXT:    fsd ft5, 96(a1)
+; LP64-NEXT:    fsd ft4, 88(a1)
+; LP64-NEXT:    fsd ft3, 80(a1)
+; LP64-NEXT:    fsd ft2, 72(a1)
+; LP64-NEXT:    fsd ft1, 64(a1)
+; LP64-NEXT:    fsd ft0, 56(a1)
+; LP64-NEXT:    fsd fa0, 48(a1)
+; LP64-NEXT:    fsd fa1, 40(a1)
+; LP64-NEXT:    fsd fa2, 32(a1)
+; LP64-NEXT:    fsd fa3, 24(a1)
+; LP64-NEXT:    fsd fa4, 16(a1)
+; LP64-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
 ; ILP32D-LABEL: callee:
@@ -173,71 +173,71 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
-; ILP32D-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    fld fa3, 16(a1)
-; ILP32D-NEXT:    fld fa2, 24(a1)
-; ILP32D-NEXT:    fld fa1, 32(a1)
-; ILP32D-NEXT:    fld fa0, 40(a1)
-; ILP32D-NEXT:    fld ft0, 48(a1)
-; ILP32D-NEXT:    fld ft1, 56(a1)
-; ILP32D-NEXT:    fld ft2, 64(a1)
-; ILP32D-NEXT:    fld ft3, 72(a1)
-; ILP32D-NEXT:    fld ft4, 80(a1)
-; ILP32D-NEXT:    fld ft5, 88(a1)
-; ILP32D-NEXT:    fld ft6, 96(a1)
-; ILP32D-NEXT:    fld ft7, 104(a1)
-; ILP32D-NEXT:    fld fa6, 112(a1)
-; ILP32D-NEXT:    fld fa7, 120(a1)
-; ILP32D-NEXT:    fld ft8, 128(a1)
-; ILP32D-NEXT:    fld ft9, 136(a1)
-; ILP32D-NEXT:    fld ft10, 144(a1)
-; ILP32D-NEXT:    fld ft11, 152(a1)
-; ILP32D-NEXT:    fld fs0, 160(a1)
-; ILP32D-NEXT:    fld fs1, 168(a1)
-; ILP32D-NEXT:    fld fs2, 176(a1)
-; ILP32D-NEXT:    fld fs3, 184(a1)
-; ILP32D-NEXT:    fld fs4, 192(a1)
-; ILP32D-NEXT:    fld fs5, 200(a1)
-; ILP32D-NEXT:    fld fs6, 208(a1)
-; ILP32D-NEXT:    fld fs7, 216(a1)
-; ILP32D-NEXT:    fld fs8, 248(a1)
+; ILP32D-NEXT:    fld fa5, 248(a1)
+; ILP32D-NEXT:    fld fa4, 16(a1)
+; ILP32D-NEXT:    fld fa3, 24(a1)
+; ILP32D-NEXT:    fld fa2, 32(a1)
+; ILP32D-NEXT:    fld fa1, 40(a1)
+; ILP32D-NEXT:    fld fa0, 48(a1)
+; ILP32D-NEXT:    fld ft0, 56(a1)
+; ILP32D-NEXT:    fld ft1, 64(a1)
+; ILP32D-NEXT:    fld ft2, 72(a1)
+; ILP32D-NEXT:    fld ft3, 80(a1)
+; ILP32D-NEXT:    fld ft4, 88(a1)
+; ILP32D-NEXT:    fld ft5, 96(a1)
+; ILP32D-NEXT:    fld ft6, 104(a1)
+; ILP32D-NEXT:    fld ft7, 112(a1)
+; ILP32D-NEXT:    fld fa6, 120(a1)
+; ILP32D-NEXT:    fld fa7, 128(a1)
+; ILP32D-NEXT:    fld ft8, 136(a1)
+; ILP32D-NEXT:    fld ft9, 144(a1)
+; ILP32D-NEXT:    fld ft10, 152(a1)
+; ILP32D-NEXT:    fld ft11, 160(a1)
+; ILP32D-NEXT:    fld fs0, 168(a1)
+; ILP32D-NEXT:    fld fs1, 176(a1)
+; ILP32D-NEXT:    fld fs2, 184(a1)
+; ILP32D-NEXT:    fld fs3, 192(a1)
+; ILP32D-NEXT:    fld fs4, 200(a1)
+; ILP32D-NEXT:    fld fs5, 208(a1)
+; ILP32D-NEXT:    fld fs6, 216(a1)
+; ILP32D-NEXT:    fld fs7, 224(a1)
+; ILP32D-NEXT:    fld fs8, 232(a1)
 ; ILP32D-NEXT:    fld fs9, 240(a1)
-; ILP32D-NEXT:    fld fs10, 232(a1)
-; ILP32D-NEXT:    fld fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs8, 248(a1)
+; ILP32D-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32D-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fa5, 248(a1)
 ; ILP32D-NEXT:    fsd fs9, 240(a1)
-; ILP32D-NEXT:    fsd fs10, 232(a1)
-; ILP32D-NEXT:    fsd fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs7, 216(a1)
-; ILP32D-NEXT:    fsd fs6, 208(a1)
-; ILP32D-NEXT:    fsd fs5, 200(a1)
-; ILP32D-NEXT:    fsd fs4, 192(a1)
-; ILP32D-NEXT:    fsd fs3, 184(a1)
-; ILP32D-NEXT:    fsd fs2, 176(a1)
-; ILP32D-NEXT:    fsd fs1, 168(a1)
-; ILP32D-NEXT:    fsd fs0, 160(a1)
-; ILP32D-NEXT:    fsd ft11, 152(a1)
-; ILP32D-NEXT:    fsd ft10, 144(a1)
-; ILP32D-NEXT:    fsd ft9, 136(a1)
-; ILP32D-NEXT:    fsd ft8, 128(a1)
-; ILP32D-NEXT:    fsd fa7, 120(a1)
-; ILP32D-NEXT:    fsd fa6, 112(a1)
-; ILP32D-NEXT:    fsd ft7, 104(a1)
-; ILP32D-NEXT:    fsd ft6, 96(a1)
-; ILP32D-NEXT:    fsd ft5, 88(a1)
-; ILP32D-NEXT:    fsd ft4, 80(a1)
-; ILP32D-NEXT:    fsd ft3, 72(a1)
-; ILP32D-NEXT:    fsd ft2, 64(a1)
-; ILP32D-NEXT:    fsd ft1, 56(a1)
-; ILP32D-NEXT:    fsd ft0, 48(a1)
-; ILP32D-NEXT:    fsd fa0, 40(a1)
-; ILP32D-NEXT:    fsd fa1, 32(a1)
-; ILP32D-NEXT:    fsd fa2, 24(a1)
-; ILP32D-NEXT:    fsd fa3, 16(a1)
-; ILP32D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32D-NEXT:    fsd fs8, 232(a1)
+; ILP32D-NEXT:    fsd fs7, 224(a1)
+; ILP32D-NEXT:    fsd fs6, 216(a1)
+; ILP32D-NEXT:    fsd fs5, 208(a1)
+; ILP32D-NEXT:    fsd fs4, 200(a1)
+; ILP32D-NEXT:    fsd fs3, 192(a1)
+; ILP32D-NEXT:    fsd fs2, 184(a1)
+; ILP32D-NEXT:    fsd fs1, 176(a1)
+; ILP32D-NEXT:    fsd fs0, 168(a1)
+; ILP32D-NEXT:    fsd ft11, 160(a1)
+; ILP32D-NEXT:    fsd ft10, 152(a1)
+; ILP32D-NEXT:    fsd ft9, 144(a1)
+; ILP32D-NEXT:    fsd ft8, 136(a1)
+; ILP32D-NEXT:    fsd fa7, 128(a1)
+; ILP32D-NEXT:    fsd fa6, 120(a1)
+; ILP32D-NEXT:    fsd ft7, 112(a1)
+; ILP32D-NEXT:    fsd ft6, 104(a1)
+; ILP32D-NEXT:    fsd ft5, 96(a1)
+; ILP32D-NEXT:    fsd ft4, 88(a1)
+; ILP32D-NEXT:    fsd ft3, 80(a1)
+; ILP32D-NEXT:    fsd ft2, 72(a1)
+; ILP32D-NEXT:    fsd ft1, 64(a1)
+; ILP32D-NEXT:    fsd ft0, 56(a1)
+; ILP32D-NEXT:    fsd fa0, 48(a1)
+; ILP32D-NEXT:    fsd fa1, 40(a1)
+; ILP32D-NEXT:    fsd fa2, 32(a1)
+; ILP32D-NEXT:    fsd fa3, 24(a1)
+; ILP32D-NEXT:    fsd fa4, 16(a1)
+; ILP32D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
@@ -269,71 +269,71 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
-; LP64D-NEXT:    fld fa5, %lo(var)(a0)
-; LP64D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    fld fa3, 16(a1)
-; LP64D-NEXT:    fld fa2, 24(a1)
-; LP64D-NEXT:    fld fa1, 32(a1)
-; LP64D-NEXT:    fld fa0, 40(a1)
-; LP64D-NEXT:    fld ft0, 48(a1)
-; LP64D-NEXT:    fld ft1, 56(a1)
-; LP64D-NEXT:    fld ft2, 64(a1)
-; LP64D-NEXT:    fld ft3, 72(a1)
-; LP64D-NEXT:    fld ft4, 80(a1)
-; LP64D-NEXT:    fld ft5, 88(a1)
-; LP64D-NEXT:    fld ft6, 96(a1)
-; LP64D-NEXT:    fld ft7, 104(a1)
-; LP64D-NEXT:    fld fa6, 112(a1)
-; LP64D-NEXT:    fld fa7, 120(a1)
-; LP64D-NEXT:    fld ft8, 128(a1)
-; LP64D-NEXT:    fld ft9, 136(a1)
-; LP64D-NEXT:    fld ft10, 144(a1)
-; LP64D-NEXT:    fld ft11, 152(a1)
-; LP64D-NEXT:    fld fs0, 160(a1)
-; LP64D-NEXT:    fld fs1, 168(a1)
-; LP64D-NEXT:    fld fs2, 176(a1)
-; LP64D-NEXT:    fld fs3, 184(a1)
-; LP64D-NEXT:    fld fs4, 192(a1)
-; LP64D-NEXT:    fld fs5, 200(a1)
-; LP64D-NEXT:    fld fs6, 208(a1)
-; LP64D-NEXT:    fld fs7, 216(a1)
-; LP64D-NEXT:    fld fs8, 248(a1)
+; LP64D-NEXT:    fld fa5, 248(a1)
+; LP64D-NEXT:    fld fa4, 16(a1)
+; LP64D-NEXT:    fld fa3, 24(a1)
+; LP64D-NEXT:    fld fa2, 32(a1)
+; LP64D-NEXT:    fld fa1, 40(a1)
+; LP64D-NEXT:    fld fa0, 48(a1)
+; LP64D-NEXT:    fld ft0, 56(a1)
+; LP64D-NEXT:    fld ft1, 64(a1)
+; LP64D-NEXT:    fld ft2, 72(a1)
+; LP64D-NEXT:    fld ft3, 80(a1)
+; LP64D-NEXT:    fld ft4, 88(a1)
+; LP64D-NEXT:    fld ft5, 96(a1)
+; LP64D-NEXT:    fld ft6, 104(a1)
+; LP64D-NEXT:    fld ft7, 112(a1)
+; LP64D-NEXT:    fld fa6, 120(a1)
+; LP64D-NEXT:    fld fa7, 128(a1)
+; LP64D-NEXT:    fld ft8, 136(a1)
+; LP64D-NEXT:    fld ft9, 144(a1)
+; LP64D-NEXT:    fld ft10, 152(a1)
+; LP64D-NEXT:    fld ft11, 160(a1)
+; LP64D-NEXT:    fld fs0, 168(a1)
+; LP64D-NEXT:    fld fs1, 176(a1)
+; LP64D-NEXT:    fld fs2, 184(a1)
+; LP64D-NEXT:    fld fs3, 192(a1)
+; LP64D-NEXT:    fld fs4, 200(a1)
+; LP64D-NEXT:    fld fs5, 208(a1)
+; LP64D-NEXT:    fld fs6, 216(a1)
+; LP64D-NEXT:    fld fs7, 224(a1)
+; LP64D-NEXT:    fld fs8, 232(a1)
 ; LP64D-NEXT:    fld fs9, 240(a1)
-; LP64D-NEXT:    fld fs10, 232(a1)
-; LP64D-NEXT:    fld fs11, 224(a1)
-; LP64D-NEXT:    fsd fs8, 248(a1)
+; LP64D-NEXT:    fld fs10, %lo(var)(a0)
+; LP64D-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fa5, 248(a1)
 ; LP64D-NEXT:    fsd fs9, 240(a1)
-; LP64D-NEXT:    fsd fs10, 232(a1)
-; LP64D-NEXT:    fsd fs11, 224(a1)
-; LP64D-NEXT:    fsd fs7, 216(a1)
-; LP64D-NEXT:    fsd fs6, 208(a1)
-; LP64D-NEXT:    fsd fs5, 200(a1)
-; LP64D-NEXT:    fsd fs4, 192(a1)
-; LP64D-NEXT:    fsd fs3, 184(a1)
-; LP64D-NEXT:    fsd fs2, 176(a1)
-; LP64D-NEXT:    fsd fs1, 168(a1)
-; LP64D-NEXT:    fsd fs0, 160(a1)
-; LP64D-NEXT:    fsd ft11, 152(a1)
-; LP64D-NEXT:    fsd ft10, 144(a1)
-; LP64D-NEXT:    fsd ft9, 136(a1)
-; LP64D-NEXT:    fsd ft8, 128(a1)
-; LP64D-NEXT:    fsd fa7, 120(a1)
-; LP64D-NEXT:    fsd fa6, 112(a1)
-; LP64D-NEXT:    fsd ft7, 104(a1)
-; LP64D-NEXT:    fsd ft6, 96(a1)
-; LP64D-NEXT:    fsd ft5, 88(a1)
-; LP64D-NEXT:    fsd ft4, 80(a1)
-; LP64D-NEXT:    fsd ft3, 72(a1)
-; LP64D-NEXT:    fsd ft2, 64(a1)
-; LP64D-NEXT:    fsd ft1, 56(a1)
-; LP64D-NEXT:    fsd ft0, 48(a1)
-; LP64D-NEXT:    fsd fa0, 40(a1)
-; LP64D-NEXT:    fsd fa1, 32(a1)
-; LP64D-NEXT:    fsd fa2, 24(a1)
-; LP64D-NEXT:    fsd fa3, 16(a1)
-; LP64D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64D-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64D-NEXT:    fsd fs8, 232(a1)
+; LP64D-NEXT:    fsd fs7, 224(a1)
+; LP64D-NEXT:    fsd fs6, 216(a1)
+; LP64D-NEXT:    fsd fs5, 208(a1)
+; LP64D-NEXT:    fsd fs4, 200(a1)
+; LP64D-NEXT:    fsd fs3, 192(a1)
+; LP64D-NEXT:    fsd fs2, 184(a1)
+; LP64D-NEXT:    fsd fs1, 176(a1)
+; LP64D-NEXT:    fsd fs0, 168(a1)
+; LP64D-NEXT:    fsd ft11, 160(a1)
+; LP64D-NEXT:    fsd ft10, 152(a1)
+; LP64D-NEXT:    fsd ft9, 144(a1)
+; LP64D-NEXT:    fsd ft8, 136(a1)
+; LP64D-NEXT:    fsd fa7, 128(a1)
+; LP64D-NEXT:    fsd fa6, 120(a1)
+; LP64D-NEXT:    fsd ft7, 112(a1)
+; LP64D-NEXT:    fsd ft6, 104(a1)
+; LP64D-NEXT:    fsd ft5, 96(a1)
+; LP64D-NEXT:    fsd ft4, 88(a1)
+; LP64D-NEXT:    fsd ft3, 80(a1)
+; LP64D-NEXT:    fsd ft2, 72(a1)
+; LP64D-NEXT:    fsd ft1, 64(a1)
+; LP64D-NEXT:    fsd ft0, 56(a1)
+; LP64D-NEXT:    fsd fa0, 48(a1)
+; LP64D-NEXT:    fsd fa1, 40(a1)
+; LP64D-NEXT:    fsd fa2, 32(a1)
+; LP64D-NEXT:    fsd fa3, 24(a1)
+; LP64D-NEXT:    fsd fa4, 16(a1)
+; LP64D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index 09ecbbc7e8feb..a8ca5d02ff78d 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -50,84 +50,84 @@ define void @callee() nounwind {
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var)
-; RV32I-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-NEXT:    lui a4, %hi(var)
+; RV32I-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -161,86 +161,86 @@ define void @callee() nounwind {
 ; RV32I-WITH-FP-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    addi s0, sp, 80
-; RV32I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV32I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV32I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV32I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV32I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV32I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV32I-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -260,84 +260,84 @@ define void @callee() nounwind {
 ; RV32IZCMP-LABEL: callee:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var)
-; RV32IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var)
+; RV32IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV32IZCMP-WITH-FP-LABEL: callee:
@@ -360,81 +360,81 @@ define void @callee() nounwind {
 ; RV32IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -469,84 +469,84 @@ define void @callee() nounwind {
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var)
-; RV64I-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-NEXT:    lui a4, %hi(var)
+; RV64I-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -580,86 +580,86 @@ define void @callee() nounwind {
 ; RV64I-WITH-FP-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    addi s0, sp, 160
-; RV64I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV64I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV64I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV64I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV64I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV64I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV64I-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -679,84 +679,84 @@ define void @callee() nounwind {
 ; RV64IZCMP-LABEL: callee:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var)
-; RV64IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var)
+; RV64IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV64IZCMP-WITH-FP-LABEL: callee:
@@ -779,81 +779,81 @@ define void @callee() nounwind {
 ; RV64IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
 ; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 0e4702d13a8cd..3a93ac8966025 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a3, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a3, 8(a1)
 ; RV32I-FPELIM-NEXT:    lw a4, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a1)
-; RV32I-FPELIM-NEXT:    lw a6, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a6, 8(a0)
 ; RV32I-FPELIM-NEXT:    lw a7, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a5, a6, a5
+; RV32I-FPELIM-NEXT:    lw a1, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
 ; RV32I-FPELIM-NEXT:    xor a4, a7, a4
-; RV32I-FPELIM-NEXT:    or a4, a4, a5
+; RV32I-FPELIM-NEXT:    or a2, a4, a2
+; RV32I-FPELIM-NEXT:    xor a3, a6, a3
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a2, a3, a2
-; RV32I-FPELIM-NEXT:    or a0, a2, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a3, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a3, 8(a1)
 ; RV32I-WITHFP-NEXT:    lw a4, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a1)
-; RV32I-WITHFP-NEXT:    lw a6, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a6, 8(a0)
 ; RV32I-WITHFP-NEXT:    lw a7, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a5, a6, a5
+; RV32I-WITHFP-NEXT:    lw a1, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
 ; RV32I-WITHFP-NEXT:    xor a4, a7, a4
-; RV32I-WITHFP-NEXT:    or a4, a4, a5
+; RV32I-WITHFP-NEXT:    or a2, a4, a2
+; RV32I-WITHFP-NEXT:    xor a3, a6, a3
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a2, a3, a2
-; RV32I-WITHFP-NEXT:    or a0, a2, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -297,21 +297,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lw a1, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a2, 0(a7)
+; RV32I-FPELIM-NEXT:    lw a1, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 8(a0)
 ; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a7)
+; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
+; RV32I-FPELIM-NEXT:    lw a5, 8(a7)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 0(a7)
+; RV32I-FPELIM-NEXT:    xor a1, a4, a1
 ; RV32I-FPELIM-NEXT:    xor a3, a6, a3
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
 ; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a2, a1
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    or a0, a0, a2
+; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -322,21 +322,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
-; RV32I-WITHFP-NEXT:    lw a1, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a2, 0(a7)
+; RV32I-WITHFP-NEXT:    lw a1, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 8(a0)
 ; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a7)
+; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
+; RV32I-WITHFP-NEXT:    lw a5, 8(a7)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 0(a7)
+; RV32I-WITHFP-NEXT:    xor a1, a4, a1
 ; RV32I-WITHFP-NEXT:    xor a3, a6, a3
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
 ; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a2, a1
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    or a0, a0, a2
+; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index adf3630d2a0c9..69ffbb0b2511d 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a1)
-; RV64I-NEXT:    ld a3, 0(a0)
+; RV64I-NEXT:    ld a2, 24(a1)
+; RV64I-NEXT:    ld a3, 16(a1)
 ; RV64I-NEXT:    ld a4, 8(a1)
-; RV64I-NEXT:    ld a5, 24(a1)
-; RV64I-NEXT:    ld a6, 24(a0)
+; RV64I-NEXT:    ld a5, 24(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
 ; RV64I-NEXT:    ld a7, 8(a0)
-; RV64I-NEXT:    ld a1, 16(a1)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a5, a6, a5
+; RV64I-NEXT:    ld a1, 0(a1)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a2, a5, a2
 ; RV64I-NEXT:    xor a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    or a2, a4, a2
+; RV64I-NEXT:    xor a3, a6, a3
 ; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a2, a3, a2
-; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -161,21 +161,21 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-LABEL: callee_large_scalars_exhausted_regs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 8(sp)
-; RV64I-NEXT:    ld a1, 0(a0)
-; RV64I-NEXT:    ld a2, 0(a7)
+; RV64I-NEXT:    ld a1, 24(a0)
+; RV64I-NEXT:    ld a2, 16(a0)
 ; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a0)
-; RV64I-NEXT:    ld a5, 24(a7)
+; RV64I-NEXT:    ld a4, 24(a7)
+; RV64I-NEXT:    ld a5, 16(a7)
 ; RV64I-NEXT:    ld a6, 8(a7)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a4, a5, a4
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a7, 0(a7)
+; RV64I-NEXT:    xor a1, a4, a1
 ; RV64I-NEXT:    xor a3, a6, a3
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    xor a2, a5, a2
 ; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a2, a1
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index cb64e24128b5e..46b7da2ddc210 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -302,27 +302,27 @@ define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    bgez a2, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    or a6, a1, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    snez a5, a1
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -330,27 +330,27 @@ define i128 @abs128(i128 %x) {
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    or a6, a1, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    add a2, a2, a4
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    snez a5, a1
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 0(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
@@ -384,27 +384,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    bgez a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    or a6, a1, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    snez a5, a1
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -412,27 +412,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    or a6, a1, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    add a2, a2, a4
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    snez a5, a1
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 0(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index fb7e4a4d103d0..e025b715af1d9 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -3,15 +3,13 @@
 ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s
 
 define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: udiv_i128:
-; CHECK:    call __udivti3
   %res = udiv i128 %x, %y
   ret i128 %res
 }
 
 define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
-; CHECK-LABEL: udiv_i129:
-; CHECK-NOT: call{{.*}}div
   %res = udiv i129 %x, %y
   ret i129 %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 15abc9b75883c..8ce5031780c8a 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -7,14 +7,14 @@
 define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV32-LABEL: ctz_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a2, 12(a0)
-; RV32-NEXT:    lw a4, 8(a0)
-; RV32-NEXT:    seqz a0, a3
+; RV32-NEXT:    seqz a0, a4
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    andi a0, a0, 4
-; RV32-NEXT:    seqz a3, a4
+; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    andi a3, a3, 2
 ; RV32-NEXT:    bltu a3, a0, .LBB0_2
@@ -40,14 +40,14 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ;
 ; RV64-LABEL: ctz_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a3, 0(a0)
+; RV64-NEXT:    lw a3, 16(a0)
+; RV64-NEXT:    lw a4, 0(a0)
 ; RV64-NEXT:    lw a1, 8(a0)
 ; RV64-NEXT:    lw a2, 24(a0)
-; RV64-NEXT:    lw a4, 16(a0)
-; RV64-NEXT:    seqz a0, a3
+; RV64-NEXT:    seqz a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    andi a0, a0, 4
-; RV64-NEXT:    seqz a3, a4
+; RV64-NEXT:    seqz a3, a3
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    andi a3, a3, 2
 ; RV64-NEXT:    bltu a3, a0, .LBB0_2
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index 13d03c5217fb1..6d871dccbfcd6 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -56,15 +56,15 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a2, 8(a1)
 ; RV32-NEXT:    lw a3, 12(a1)
-; RV32-NEXT:    lw a4, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lui a5, 524288
 ; RV32-NEXT:    xor a3, a3, a5
-; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a2, 8(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    sw a4, 4(a0)
 ; RV32-NEXT:    sw a3, 12(a0)
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index bfac15e009f00..0e5867800e935 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -222,8 +222,8 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset s1, -24
 ; RV64IFD-NEXT:    .cfi_offset s2, -32
 ; RV64IFD-NEXT:    .cfi_offset fs0, -40
-; RV64IFD-NEXT:    lhu s1, 16(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu s2, 16(a1)
 ; RV64IFD-NEXT:    lhu a1, 8(a1)
 ; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a1
@@ -231,23 +231,23 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2 at plt
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fs0
-; RV64IFD-NEXT:    slli s2, a0, 16
+; RV64IFD-NEXT:    slli s1, a0, 16
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    slli a0, a0, 48
 ; RV64IFD-NEXT:    srli a0, a0, 48
-; RV64IFD-NEXT:    or s2, a0, s2
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    or s1, a0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2 at plt
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    sh a0, 4(s0)
-; RV64IFD-NEXT:    sw s2, 0(s0)
+; RV64IFD-NEXT:    sw s1, 0(s0)
 ; RV64IFD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -349,9 +349,9 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset fs0, -48
 ; RV64IFD-NEXT:    .cfi_offset fs1, -56
 ; RV64IFD-NEXT:    .cfi_offset fs2, -64
-; RV64IFD-NEXT:    lhu s1, 24(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
-; RV64IFD-NEXT:    lhu s3, 8(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu s2, 8(a1)
+; RV64IFD-NEXT:    lhu s3, 24(a1)
 ; RV64IFD-NEXT:    lhu a1, 16(a1)
 ; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a1
@@ -359,17 +359,17 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s3
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2 at plt
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.s fs1, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2 at plt
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    call __truncsfhf2 at plt
 ; RV64IFD-NEXT:    fmv.s fs2, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s3
 ; RV64IFD-NEXT:    call __extendhfsf2 at plt
 ; RV64IFD-NEXT:    call exp10f at plt
 ; RV64IFD-NEXT:    fmv.x.w s1, fs2
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index 4eb969a357a9e..023e95747c2cc 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -1,25 +1,10 @@
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
-; RUN:   | FileCheck -check-prefix=NOCLUSTER %s
-; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
-; RUN:   | FileCheck -check-prefix=NOCLUSTER %s
-; RUN: llc -mtriple=riscv32 -riscv-misched-load-clustering -verify-misched \
-; RUN:     -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
-; RUN: llc -mtriple=riscv64 -riscv-misched-load-clustering -verify-misched \
-; RUN:     -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: llc -mtriple=riscv64 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
 
-
 define i32 @load_clustering_1(ptr nocapture %p) {
-; NOCLUSTER: ********** MI Scheduling **********
-; NOCLUSTER-LABEL: load_clustering_1:%bb.0
-; NOCLUSTER: *** Final schedule for %bb.0 ***
-; NOCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; NOCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
-; NOCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
-; NOCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
-;
 ; LDCLUSTER: ********** MI Scheduling **********
 ; LDCLUSTER-LABEL: load_clustering_1:%bb.0
 ; LDCLUSTER: *** Final schedule for %bb.0 ***
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index f2b7e8d26328d..33ab4fbaaf66e 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1167,28 +1167,28 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
-; RV32IM-NEXT:    lw a3, 8(a1)
-; RV32IM-NEXT:    lw a4, 0(a1)
+; RV32IM-NEXT:    lw a2, 0(a1)
+; RV32IM-NEXT:    lw a3, 12(a1)
+; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    li a5, -15
 ; RV32IM-NEXT:    slli a5, a5, 8
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    mul a7, a1, a5
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
 ; RV32IM-NEXT:    mulhu t0, a1, a5
 ; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a4
-; RV32IM-NEXT:    neg t0, a4
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    neg t0, a2
 ; RV32IM-NEXT:    sltu t1, a6, t0
 ; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a4, t2
+; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
 ; RV32IM-NEXT:    sub t4, t1, a1
-; RV32IM-NEXT:    mul t5, a3, a5
-; RV32IM-NEXT:    sub t5, t5, a4
+; RV32IM-NEXT:    mul t5, a4, a5
+; RV32IM-NEXT:    sub t5, t5, a2
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
 ; RV32IM-NEXT:    neg s1, a1
@@ -1198,17 +1198,17 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    mulhu t1, a3, a5
-; RV32IM-NEXT:    sub a3, t1, a3
-; RV32IM-NEXT:    add a2, a3, a2
-; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    mul a3, a3, a5
+; RV32IM-NEXT:    mulhu t1, a4, a5
+; RV32IM-NEXT:    sub a4, t1, a4
+; RV32IM-NEXT:    add a3, a4, a3
+; RV32IM-NEXT:    add a1, a2, a1
 ; RV32IM-NEXT:    sub a1, t3, a1
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    mul a2, a4, a5
+; RV32IM-NEXT:    mul a2, a2, a5
 ; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
 ; RV32IM-NEXT:    sw t6, 8(a0)
@@ -1292,17 +1292,17 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw a4, 12(a1)
 ; RV32IM-NEXT:    lw a1, 8(a1)
 ; RV32IM-NEXT:    li a5, -63
 ; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    slli a7, a4, 6
-; RV32IM-NEXT:    sub a7, a4, a7
+; RV32IM-NEXT:    slli a7, a2, 6
+; RV32IM-NEXT:    sub a7, a2, a7
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a4, a5
+; RV32IM-NEXT:    mulhu t0, a2, a5
 ; RV32IM-NEXT:    add a7, t0, a7
 ; RV32IM-NEXT:    sub a6, a6, a3
 ; RV32IM-NEXT:    neg t0, a3
@@ -1311,27 +1311,27 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    mulhu t3, a3, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a4
+; RV32IM-NEXT:    sub t4, t1, a2
 ; RV32IM-NEXT:    slli t5, a1, 6
 ; RV32IM-NEXT:    sub t6, a1, a3
 ; RV32IM-NEXT:    sub t5, t6, t5
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a4
+; RV32IM-NEXT:    neg s1, a2
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a4, t2
+; RV32IM-NEXT:    mulhu t1, a2, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a2, 6
-; RV32IM-NEXT:    sub a2, a2, t1
+; RV32IM-NEXT:    slli t1, a4, 6
+; RV32IM-NEXT:    sub a4, a4, t1
 ; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    sub a5, a5, a1
-; RV32IM-NEXT:    add a2, a5, a2
-; RV32IM-NEXT:    add a4, a3, a4
-; RV32IM-NEXT:    sub a1, t3, a4
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a4, a5, a4
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    sub a1, t3, a2
+; RV32IM-NEXT:    add a1, a1, a4
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 4c5c36fc72d14..f2421970340b5 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -907,54 +907,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -968,54 +968,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1029,44 +1029,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -1090,44 +1090,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -1163,112 +1163,112 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -2321,54 +2321,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -2382,54 +2382,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -2443,44 +2443,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
@@ -2504,44 +2504,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
@@ -2577,112 +2577,112 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -3735,54 +3735,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -3796,54 +3796,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -3857,44 +3857,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
@@ -3918,44 +3918,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
@@ -3991,112 +3991,112 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -5149,54 +5149,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -5210,54 +5210,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -5271,44 +5271,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
@@ -5332,44 +5332,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
@@ -5405,112 +5405,112 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -6563,54 +6563,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -6624,54 +6624,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -6685,44 +6685,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -6746,44 +6746,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -6819,112 +6819,112 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
index 776944b177636..4521b07fd862c 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
@@ -1133,41 +1133,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1176,41 +1176,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1219,41 +1219,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1262,41 +1262,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1310,41 +1310,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32I-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(var0)
 ; RV32I-NEXT:    lw a1, %lo(var0)(a0)
-; RV32I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV32I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    addi a5, a0, %lo(var0)
-; RV32I-NEXT:    lw a6, 16(a5)
-; RV32I-NEXT:    lw a7, 20(a5)
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 68(a5)
-; RV32I-NEXT:    lw s2, 64(a5)
-; RV32I-NEXT:    lw s3, 60(a5)
-; RV32I-NEXT:    lw s4, 56(a5)
-; RV32I-NEXT:    sw s1, 68(a5)
-; RV32I-NEXT:    sw s2, 64(a5)
-; RV32I-NEXT:    sw s3, 60(a5)
-; RV32I-NEXT:    sw s4, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
-; RV32I-NEXT:    sw a7, 20(a5)
-; RV32I-NEXT:    sw a6, 16(a5)
-; RV32I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV32I-NEXT:    addi a2, a0, %lo(var0)
+; RV32I-NEXT:    lw a3, 16(a2)
+; RV32I-NEXT:    lw a4, 20(a2)
+; RV32I-NEXT:    lw a5, 24(a2)
+; RV32I-NEXT:    lw a6, 28(a2)
+; RV32I-NEXT:    lw a7, 32(a2)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
+; RV32I-NEXT:    sw a7, 32(a2)
+; RV32I-NEXT:    sw a6, 28(a2)
+; RV32I-NEXT:    sw a5, 24(a2)
+; RV32I-NEXT:    sw a4, 20(a2)
+; RV32I-NEXT:    sw a3, 16(a2)
+; RV32I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV32I-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
@@ -1364,41 +1364,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64I-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lui a0, %hi(var0)
 ; RV64I-NEXT:    lw a1, %lo(var0)(a0)
-; RV64I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV64I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    addi a5, a0, %lo(var0)
-; RV64I-NEXT:    lw a6, 16(a5)
-; RV64I-NEXT:    lw a7, 20(a5)
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 68(a5)
-; RV64I-NEXT:    lw s2, 64(a5)
-; RV64I-NEXT:    lw s3, 60(a5)
-; RV64I-NEXT:    lw s4, 56(a5)
-; RV64I-NEXT:    sw s1, 68(a5)
-; RV64I-NEXT:    sw s2, 64(a5)
-; RV64I-NEXT:    sw s3, 60(a5)
-; RV64I-NEXT:    sw s4, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
-; RV64I-NEXT:    sw a7, 20(a5)
-; RV64I-NEXT:    sw a6, 16(a5)
-; RV64I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV64I-NEXT:    addi a2, a0, %lo(var0)
+; RV64I-NEXT:    lw a3, 16(a2)
+; RV64I-NEXT:    lw a4, 20(a2)
+; RV64I-NEXT:    lw a5, 24(a2)
+; RV64I-NEXT:    lw a6, 28(a2)
+; RV64I-NEXT:    lw a7, 32(a2)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
+; RV64I-NEXT:    sw a7, 32(a2)
+; RV64I-NEXT:    sw a6, 28(a2)
+; RV64I-NEXT:    sw a5, 24(a2)
+; RV64I-NEXT:    sw a4, 20(a2)
+; RV64I-NEXT:    sw a3, 16(a2)
+; RV64I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV64I-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
@@ -1837,84 +1837,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -1953,84 +1953,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2069,84 +2069,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -2185,84 +2185,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2313,84 +2313,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32I-NEXT:    sw t4, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t5, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t1, 132(sp) # 4-byte Folded Reload
@@ -2453,84 +2453,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64I-NEXT:    sd t4, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t5, 56(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t6, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 264(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t0, 256(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t1, 248(sp) # 8-byte Folded Reload
@@ -2570,333 +2570,333 @@ define void @callee_no_irq() nounwind{
 ; RV32IZCMP-LABEL: callee_no_irq:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-LABEL: callee_no_irq:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32IZCMP-SR-LABEL: callee_no_irq:
 ; RV32IZCMP-SR:       # %bb.0:
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-SR-LABEL: callee_no_irq:
 ; RV64IZCMP-SR:       # %bb.0:
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32I-LABEL: callee_no_irq:
@@ -2915,84 +2915,84 @@ define void @callee_no_irq() nounwind{
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -3025,84 +3025,84 @@ define void @callee_no_irq() nounwind{
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
index b2dea4237f5a5..6a605b2cc53ae 100644
--- a/llvm/test/CodeGen/RISCV/reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -8,24 +8,24 @@
 define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_sum_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_sum_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 24(a0)
-; RV64-NEXT:    lw a2, 8(a0)
-; RV64-NEXT:    lw a3, 0(a0)
+; RV64-NEXT:    lw a1, 8(a0)
+; RV64-NEXT:    lw a2, 0(a0)
+; RV64-NEXT:    lw a3, 24(a0)
 ; RV64-NEXT:    lw a0, 16(a0)
-; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addw a0, a2, a0
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -40,24 +40,24 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_xor_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    xor a2, a3, a2
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    xor a1, a2, a1
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_xor_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a1, 8(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 24(a0)
 ; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    xor a2, a3, a2
-; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    xor a1, a2, a1
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -72,24 +72,24 @@ define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 define i32 @reduce_or_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_or_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    or a2, a3, a2
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    or a1, a2, a1
+; RV32-NEXT:    or a0, a0, a3
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_or_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a1, 8(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 24(a0)
 ; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    or a2, a3, a2
-; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    or a1, a2, a1
+; RV64-NEXT:    or a0, a0, a3
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 5f9ca503bcb05..56fe3340c83e7 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -749,22 +749,22 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a0, 4(a1)
 ; RV32I-NEXT:    lw s2, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
 ; RV32I-NEXT:    lw s6, 0(a1)
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi s3, a2, 1365
-; RV32I-NEXT:    and a1, a1, s3
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi s4, a1, 819
-; RV32I-NEXT:    and a1, a0, s4
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, s4
-; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a0, a2, 1
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    addi s3, a1, 1365
+; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    sub a2, a2, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi s4, a0, 819
+; RV32I-NEXT:    and a0, a2, s4
+; RV32I-NEXT:    srli a2, a2, 2
+; RV32I-NEXT:    and a1, a2, s4
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    lui a1, 61681
@@ -835,20 +835,20 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 4(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a1, 12(a1)
-; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    cpop a4, a4
+; RV32ZBB-NEXT:    add a3, a4, a3
+; RV32ZBB-NEXT:    cpop a2, a2
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a3, a4
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    sw zero, 12(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   ret <2 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index f38aa71fb158d..6c4466796aeed 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -177,12 +177,12 @@ define i8 @test13(ptr %0, i64 %1) {
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    subw a2, a2, a1
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    lbu a2, 0(a2)
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    subw a3, a3, a1
 ; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    lbu a1, 0(a2)
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    ret
   %3 = mul i64 %1, -4294967296
   %4 = add i64 %3, 4294967296 ; 1 << 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index d34c10798f482..f6d1d3882e5e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -8,13 +8,13 @@ declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-LABEL: vpreduce_add_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a5, 12(a1)
-; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a4, 12(a1)
+; RV32-NEXT:    lw a5, 8(a1)
+; RV32-NEXT:    lw a6, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lw a7, 0(a2)
-; RV32-NEXT:    lw t0, 8(a2)
-; RV32-NEXT:    lw t1, 12(a2)
+; RV32-NEXT:    lw t0, 12(a2)
+; RV32-NEXT:    lw t1, 8(a2)
 ; RV32-NEXT:    lw a2, 4(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
@@ -24,32 +24,32 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    sltiu a3, a3, 2
 ; RV32-NEXT:    xori a3, a3, 1
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a3, t4, t1
-; RV32-NEXT:    and t0, t3, t0
+; RV32-NEXT:    and a3, t4, t0
+; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
 ; RV32-NEXT:    and a1, a7, a1
 ; RV32-NEXT:    neg a7, t0
-; RV32-NEXT:    and a6, a7, a6
+; RV32-NEXT:    and a5, a7, a5
 ; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a3, a3, a5
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a2, a2, a6
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a1, a1, a6
+; RV32-NEXT:    add a1, a1, a5
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpreduce_add_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a4, 8(a1)
-; RV64-NEXT:    lw a5, 24(a1)
-; RV64-NEXT:    lw a6, 16(a1)
+; RV64-NEXT:    lw a4, 24(a1)
+; RV64-NEXT:    lw a5, 16(a1)
+; RV64-NEXT:    lw a6, 8(a1)
 ; RV64-NEXT:    lw a1, 0(a1)
 ; RV64-NEXT:    ld a7, 0(a2)
-; RV64-NEXT:    ld t0, 16(a2)
-; RV64-NEXT:    ld t1, 24(a2)
+; RV64-NEXT:    ld t0, 24(a2)
+; RV64-NEXT:    ld t1, 16(a2)
 ; RV64-NEXT:    ld a2, 8(a2)
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
@@ -60,19 +60,19 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    sltiu a3, a3, 2
 ; RV64-NEXT:    xori a3, a3, 1
 ; RV64-NEXT:    and a2, a3, a2
-; RV64-NEXT:    and a3, t4, t1
-; RV64-NEXT:    and t0, t3, t0
+; RV64-NEXT:    and a3, t4, t0
+; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
 ; RV64-NEXT:    and a1, a7, a1
 ; RV64-NEXT:    negw a7, t0
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    and a5, a7, a5
 ; RV64-NEXT:    negw a3, a3
-; RV64-NEXT:    and a3, a3, a5
+; RV64-NEXT:    and a3, a3, a4
 ; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    and a2, a2, a4
+; RV64-NEXT:    and a2, a2, a6
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a1, a1, a6
+; RV64-NEXT:    add a1, a1, a5
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 8ed19ddb1af5c..24a7655dc35a1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -26,26 +26,26 @@ define void @add_v4i32(ptr %x, ptr %y) {
 define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    lw a3, 12(a0)
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
+; RV32-NEXT:    lw a2, 12(a0)
+; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
+; RV32-NEXT:    lw a5, 12(a1)
 ; RV32-NEXT:    lw a6, 4(a1)
 ; RV32-NEXT:    lw a7, 0(a1)
-; RV32-NEXT:    lw t0, 8(a1)
-; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a7, a4, a7
-; RV32-NEXT:    sltu a4, a7, a4
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add t0, a2, t0
-; RV32-NEXT:    sltu a2, t0, a2
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    sw t0, 8(a0)
+; RV32-NEXT:    lw t0, 8(a0)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a7, a3, a7
+; RV32-NEXT:    sltu a3, a7, a3
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    add a1, t0, a1
+; RV32-NEXT:    sltu a4, a1, t0
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    sw a1, 8(a0)
 ; RV32-NEXT:    sw a7, 0(a0)
-; RV32-NEXT:    sw a1, 12(a0)
-; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 8acc70faaa1fc..799e20074b042 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,18 +7,18 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X:       # %bb.0:
-; ZVE32X-NEXT:    ld a1, 80(a0)
-; ZVE32X-NEXT:    ld a2, 72(a0)
-; ZVE32X-NEXT:    ld a3, 56(a0)
-; ZVE32X-NEXT:    ld a4, 32(a0)
-; ZVE32X-NEXT:    ld a5, 24(a0)
+; ZVE32X-NEXT:    ld a1, 32(a0)
+; ZVE32X-NEXT:    ld a2, 24(a0)
+; ZVE32X-NEXT:    ld a3, 80(a0)
+; ZVE32X-NEXT:    ld a4, 72(a0)
+; ZVE32X-NEXT:    ld a5, 56(a0)
 ; ZVE32X-NEXT:    ld a6, 48(a0)
 ; ZVE32X-NEXT:    ld a7, 8(a0)
 ; ZVE32X-NEXT:    ld a0, 0(a0)
-; ZVE32X-NEXT:    xor a4, a5, a4
-; ZVE32X-NEXT:    snez a4, a4
+; ZVE32X-NEXT:    xor a1, a2, a1
+; ZVE32X-NEXT:    snez a1, a1
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.s.x v8, a4
+; ZVE32X-NEXT:    vmv.s.x v8, a1
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vmv.s.x v8, zero
@@ -36,7 +36,7 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a0, a6, a3
+; ZVE32X-NEXT:    xor a0, a6, a5
 ; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v11, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -48,8 +48,8 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v9, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a1, a2, a1
-; ZVE32X-NEXT:    snez a0, a1
+; ZVE32X-NEXT:    xor a3, a4, a3
+; ZVE32X-NEXT:    snez a0, a3
 ; ZVE32X-NEXT:    vmv.s.x v10, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index d74fd6cd3f034..221f9a005bc23 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3966,22 +3966,22 @@ define <4 x i64> @mgather_falsemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru)
 ;
 ; RV32ZVE32F-LABEL: mgather_falsemask_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 0(a1)
-; RV32ZVE32F-NEXT:    lw a3, 4(a1)
-; RV32ZVE32F-NEXT:    lw a4, 8(a1)
-; RV32ZVE32F-NEXT:    lw a5, 12(a1)
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a7, 24(a1)
-; RV32ZVE32F-NEXT:    lw t0, 20(a1)
+; RV32ZVE32F-NEXT:    lw a2, 20(a1)
+; RV32ZVE32F-NEXT:    lw a3, 24(a1)
+; RV32ZVE32F-NEXT:    lw a4, 28(a1)
+; RV32ZVE32F-NEXT:    lw a5, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 4(a1)
+; RV32ZVE32F-NEXT:    lw a7, 8(a1)
+; RV32ZVE32F-NEXT:    lw t0, 12(a1)
 ; RV32ZVE32F-NEXT:    lw a1, 16(a1)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
-; RV32ZVE32F-NEXT:    sw a7, 24(a0)
-; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a4, 28(a0)
+; RV32ZVE32F-NEXT:    sw a3, 24(a0)
+; RV32ZVE32F-NEXT:    sw a2, 20(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 16(a0)
-; RV32ZVE32F-NEXT:    sw a5, 12(a0)
-; RV32ZVE32F-NEXT:    sw a4, 8(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t0, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 8(a0)
+; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_falsemask_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 846295b3ead27..534f80a302229 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -753,18 +753,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    ld a6, 8(a1)
-; ZVE32F-NEXT:    ld a7, 0(a1)
-; ZVE32F-NEXT:    ld t0, 24(a1)
-; ZVE32F-NEXT:    ld t1, 16(a1)
+; ZVE32F-NEXT:    ld a6, 24(a1)
+; ZVE32F-NEXT:    ld a7, 16(a1)
+; ZVE32F-NEXT:    ld t0, 8(a1)
+; ZVE32F-NEXT:    ld t1, 0(a1)
 ; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    mul t3, a2, a5
 ; ZVE32F-NEXT:    add t3, a0, t3
-; ZVE32F-NEXT:    sd a7, 0(t3)
-; ZVE32F-NEXT:    sd a6, 0(t2)
-; ZVE32F-NEXT:    sd t1, 80(t3)
-; ZVE32F-NEXT:    sd t0, 80(t2)
+; ZVE32F-NEXT:    sd t1, 0(t3)
+; ZVE32F-NEXT:    sd t0, 0(t2)
+; ZVE32F-NEXT:    sd a7, 80(t3)
+; ZVE32F-NEXT:    sd a6, 80(t2)
 ; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    addi a4, a4, -4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 7497051027fa3..352e3a2df1539 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -364,21 +364,21 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -459,9 +459,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -470,7 +470,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -483,7 +483,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -499,7 +499,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -631,9 +631,9 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -642,7 +642,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -655,7 +655,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -671,7 +671,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -812,9 +812,9 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -823,7 +823,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -836,7 +836,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -852,7 +852,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -1267,37 +1267,37 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -1448,13 +1448,13 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -1463,7 +1463,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1472,7 +1472,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1481,7 +1481,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1731,13 +1731,13 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -1746,7 +1746,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1755,7 +1755,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1764,7 +1764,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -2034,13 +2034,13 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -2049,7 +2049,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -2058,7 +2058,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -2067,7 +2067,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -3700,21 +3700,21 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -3795,9 +3795,9 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -3806,7 +3806,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -3819,7 +3819,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -3835,7 +3835,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -3965,9 +3965,9 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -3976,7 +3976,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -3989,7 +3989,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4005,7 +4005,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4058,21 +4058,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4145,9 +4145,9 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -4156,7 +4156,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -4169,7 +4169,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4185,7 +4185,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4588,37 +4588,37 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4769,13 +4769,13 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -4784,7 +4784,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -4793,7 +4793,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -4802,7 +4802,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5048,13 +5048,13 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -5063,7 +5063,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5072,7 +5072,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5081,7 +5081,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5187,37 +5187,37 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -5350,13 +5350,13 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -5365,7 +5365,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5374,7 +5374,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5383,7 +5383,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index 65dca0daed8c7..bf4dbe7ee14ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -9,20 +9,20 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    lhu s0, 6(a0)
-; CHECK-NEXT:    lhu s1, 4(a0)
-; CHECK-NEXT:    lhu s2, 0(a0)
+; CHECK-NEXT:    lhu s0, 0(a0)
+; CHECK-NEXT:    lhu s1, 6(a0)
+; CHECK-NEXT:    lhu s2, 4(a0)
 ; CHECK-NEXT:    lhu a0, 2(a0)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 8(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s2
+; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 0(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s1
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 12(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s0
+; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 4(sp)
 ; CHECK-NEXT:    addi a0, sp, 8
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 97121c275a294..40adbbcd41fcd 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -151,12 +151,19 @@ define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: lshr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
 ; RV32I-NEXT:    sb zero, 31(sp)
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
@@ -169,94 +176,90 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
 ; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
+; RV32I-NEXT:    sb a1, 16(sp)
+; RV32I-NEXT:    sb a5, 12(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a3, 4(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
+; RV32I-NEXT:    sb a6, 19(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
+; RV32I-NEXT:    sb a6, 18(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
+; RV32I-NEXT:    sb a1, 17(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
+; RV32I-NEXT:    sb a1, 14(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
+; RV32I-NEXT:    sb a5, 13(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
+; RV32I-NEXT:    sb a1, 11(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
+; RV32I-NEXT:    sb a1, 10(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
+; RV32I-NEXT:    sb a4, 9(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
+; RV32I-NEXT:    sb a1, 7(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
+; RV32I-NEXT:    sb a1, 6(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
+; RV32I-NEXT:    sb a3, 5(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    addi a3, sp, 4
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -265,7 +268,10 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: lshr128:
@@ -293,125 +299,131 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: ashr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw a3, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a5, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a5, 0(a1)
+; RV32I-NEXT:    lw a6, 4(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    sb a4, 16(sp)
 ; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a6, 8(sp)
 ; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
+; RV32I-NEXT:    srai a2, a4, 31
+; RV32I-NEXT:    sb a2, 32(sp)
+; RV32I-NEXT:    sb a2, 28(sp)
+; RV32I-NEXT:    sb a2, 24(sp)
+; RV32I-NEXT:    sb a2, 20(sp)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 19(sp)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 18(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(sp)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a6, 24
 ; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    srli a3, a6, 16
 ; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    srli a3, a6, 8
+; RV32I-NEXT:    sb a3, 9(sp)
 ; RV32I-NEXT:    srli a3, a5, 24
 ; RV32I-NEXT:    sb a3, 7(sp)
 ; RV32I-NEXT:    srli a3, a5, 16
 ; RV32I-NEXT:    sb a3, 6(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
 ; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a3, a2, 24
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    srli a4, a2, 16
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    srli a2, a2, 8
+; RV32I-NEXT:    sb a2, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a2, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a2, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a2, 21(sp)
+; RV32I-NEXT:    slli a2, a1, 25
+; RV32I-NEXT:    srli a2, a2, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a2, a3, a2
+; RV32I-NEXT:    lbu a3, 0(a2)
+; RV32I-NEXT:    lbu a4, 1(a2)
+; RV32I-NEXT:    lbu a5, 2(a2)
+; RV32I-NEXT:    lbu a6, 3(a2)
+; RV32I-NEXT:    lbu a7, 4(a2)
+; RV32I-NEXT:    lbu t0, 5(a2)
+; RV32I-NEXT:    lbu t1, 6(a2)
+; RV32I-NEXT:    lbu t2, 7(a2)
+; RV32I-NEXT:    lbu t3, 8(a2)
+; RV32I-NEXT:    lbu t4, 9(a2)
+; RV32I-NEXT:    lbu t5, 10(a2)
+; RV32I-NEXT:    lbu t6, 11(a2)
+; RV32I-NEXT:    lbu s0, 12(a2)
+; RV32I-NEXT:    lbu s1, 13(a2)
+; RV32I-NEXT:    lbu s2, 14(a2)
+; RV32I-NEXT:    lbu a2, 15(a2)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    andi a1, a1, 7
+; RV32I-NEXT:    srl a3, a3, a1
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    xori a6, a1, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
+; RV32I-NEXT:    not t0, a1
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a1
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a2, a2, 24
+; RV32I-NEXT:    or a2, a2, s2
+; RV32I-NEXT:    or a2, a2, s0
+; RV32I-NEXT:    slli a7, a2, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    sra a1, a2, a1
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr128:
@@ -439,12 +451,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: shl128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
 ; RV32I-NEXT:    sb zero, 15(sp)
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
@@ -457,103 +476,102 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 6(sp)
 ; RV32I-NEXT:    sb zero, 5(sp)
 ; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a5, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    sb a3, 20(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
+; RV32I-NEXT:    sb a6, 35(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
+; RV32I-NEXT:    sb a6, 34(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a1, 33(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    sb a1, 31(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
+; RV32I-NEXT:    sb a1, 30(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
+; RV32I-NEXT:    sb a5, 29(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
+; RV32I-NEXT:    sb a1, 26(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
+; RV32I-NEXT:    sb a1, 23(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
+; RV32I-NEXT:    sb a1, 22(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
+; RV32I-NEXT:    sb a3, 21(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    addi a3, sp, 20
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 4(a3)
+; RV32I-NEXT:    lbu a4, 5(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
+; RV32I-NEXT:    lbu a7, 8(a3)
+; RV32I-NEXT:    lbu t0, 9(a3)
+; RV32I-NEXT:    lbu t1, 10(a3)
+; RV32I-NEXT:    lbu t2, 11(a3)
+; RV32I-NEXT:    lbu t3, 12(a3)
+; RV32I-NEXT:    lbu t4, 13(a3)
+; RV32I-NEXT:    lbu t5, 14(a3)
+; RV32I-NEXT:    lbu t6, 15(a3)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a3)
+; RV32I-NEXT:    lbu s1, 1(a3)
+; RV32I-NEXT:    lbu s2, 2(a3)
+; RV32I-NEXT:    lbu a3, 3(a3)
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    sll a4, a1, a2
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    srli a5, a3, 1
+; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    not a7, a2
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    sll a7, a5, a2
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    sll a7, a7, a2
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    sll a2, a3, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: shl128:
@@ -616,60 +634,60 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 8(a1)
 ; RV32I-NEXT:    lw t2, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    andi t1, a2, 64
 ; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    mv a4, t2
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:    beqz t1, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    mv a4, a3
+; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:  .LBB10_2:
 ; RV32I-NEXT:    andi a6, a2, 32
-; RV32I-NEXT:    mv a5, a4
+; RV32I-NEXT:    mv a5, a1
 ; RV32I-NEXT:    bnez a6, .LBB10_13
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    bnez t1, .LBB10_14
 ; RV32I-NEXT:  .LBB10_4:
 ; RV32I-NEXT:    beqz a6, .LBB10_6
 ; RV32I-NEXT:  .LBB10_5:
-; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv t0, a4
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    slli t3, t0, 1
 ; RV32I-NEXT:    not t2, a2
 ; RV32I-NEXT:    beqz t1, .LBB10_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a1, a7
+; RV32I-NEXT:    mv a3, a7
 ; RV32I-NEXT:  .LBB10_8:
 ; RV32I-NEXT:    srl a7, a5, a2
 ; RV32I-NEXT:    sll t1, t3, t2
 ; RV32I-NEXT:    srl t0, t0, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a4, a3
 ; RV32I-NEXT:  .LBB10_10:
 ; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    slli t1, a3, 1
+; RV32I-NEXT:    slli t1, a4, 1
 ; RV32I-NEXT:    sll t1, t1, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sll a4, a4, t2
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli a1, a3, 1
+; RV32I-NEXT:    sll a1, a1, t2
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    srl a2, a3, a2
 ; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    sll a2, a5, t2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sll a3, a5, t2
+; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw t0, 4(a0)
 ; RV32I-NEXT:    sw a7, 0(a0)
 ; RV32I-NEXT:    ret
@@ -677,7 +695,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    beqz t1, .LBB10_4
 ; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    mv a3, t2
+; RV32I-NEXT:    mv a4, t2
 ; RV32I-NEXT:    bnez a6, .LBB10_5
 ; RV32I-NEXT:    j .LBB10_6
 ;
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 122388c1b73ec..fcaa1f7f238f6 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -310,22 +310,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lbu a0, 12(a0)
-; RV32-NEXT:    lw a1, 8(s0)
-; RV32-NEXT:    slli a2, a0, 30
+; RV32-NEXT:    lbu a1, 12(a0)
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    slli a0, a1, 30
 ; RV32-NEXT:    lw a3, 4(s0)
-; RV32-NEXT:    srli s1, a1, 2
-; RV32-NEXT:    or s1, s1, a2
-; RV32-NEXT:    slli a2, a1, 31
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    or s2, a4, a2
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    slli a0, a0, 31
-; RV32-NEXT:    srai s3, a0, 31
-; RV32-NEXT:    srli a1, a1, 1
-; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srli s1, a2, 2
+; RV32-NEXT:    or s1, s1, a0
+; RV32-NEXT:    slli a4, a2, 31
 ; RV32-NEXT:    lw a0, 0(s0)
-; RV32-NEXT:    srai s4, a1, 31
+; RV32-NEXT:    srli a5, a3, 1
+; RV32-NEXT:    or s2, a5, a4
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai s3, a1, 31
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a2, a2, 31
+; RV32-NEXT:    srai s4, a2, 31
 ; RV32-NEXT:    slli a1, a3, 31
 ; RV32-NEXT:    srai a1, a1, 31
 ; RV32-NEXT:    li a2, 6
@@ -462,22 +462,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    mv s0, a0
-; RV32M-NEXT:    lbu a0, 12(a0)
-; RV32M-NEXT:    lw a1, 8(s0)
-; RV32M-NEXT:    slli a2, a0, 30
+; RV32M-NEXT:    lbu a1, 12(a0)
+; RV32M-NEXT:    lw a2, 8(a0)
+; RV32M-NEXT:    slli a0, a1, 30
 ; RV32M-NEXT:    lw a3, 4(s0)
-; RV32M-NEXT:    srli s1, a1, 2
-; RV32M-NEXT:    or s1, s1, a2
-; RV32M-NEXT:    slli a2, a1, 31
-; RV32M-NEXT:    srli a4, a3, 1
-; RV32M-NEXT:    or s2, a4, a2
-; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    slli a0, a0, 31
-; RV32M-NEXT:    srai s3, a0, 31
-; RV32M-NEXT:    srli a1, a1, 1
-; RV32M-NEXT:    slli a1, a1, 31
+; RV32M-NEXT:    srli s1, a2, 2
+; RV32M-NEXT:    or s1, s1, a0
+; RV32M-NEXT:    slli a4, a2, 31
 ; RV32M-NEXT:    lw a0, 0(s0)
-; RV32M-NEXT:    srai s4, a1, 31
+; RV32M-NEXT:    srli a5, a3, 1
+; RV32M-NEXT:    or s2, a5, a4
+; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    slli a1, a1, 31
+; RV32M-NEXT:    srai s3, a1, 31
+; RV32M-NEXT:    srli a2, a2, 1
+; RV32M-NEXT:    slli a2, a2, 31
+; RV32M-NEXT:    srai s4, a2, 31
 ; RV32M-NEXT:    slli a1, a3, 31
 ; RV32M-NEXT:    srai a1, a1, 31
 ; RV32M-NEXT:    li a2, 6
@@ -536,34 +536,34 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    ld a1, 0(a0)
 ; RV64M-NEXT:    lwu a2, 8(a0)
-; RV64M-NEXT:    srli a3, a1, 2
-; RV64M-NEXT:    lbu a4, 12(a0)
+; RV64M-NEXT:    lbu a3, 12(a0)
+; RV64M-NEXT:    srli a4, a1, 2
 ; RV64M-NEXT:    slli a5, a2, 62
-; RV64M-NEXT:    or a3, a5, a3
-; RV64M-NEXT:    srai a3, a3, 31
-; RV64M-NEXT:    slli a4, a4, 32
-; RV64M-NEXT:    or a2, a2, a4
+; RV64M-NEXT:    or a4, a5, a4
+; RV64M-NEXT:    srai a4, a4, 31
+; RV64M-NEXT:    slli a3, a3, 32
+; RV64M-NEXT:    or a2, a2, a3
 ; RV64M-NEXT:    slli a2, a2, 29
-; RV64M-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64M-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64M-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64M-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64M-NEXT:    srai a2, a2, 31
 ; RV64M-NEXT:    slli a1, a1, 31
 ; RV64M-NEXT:    srai a1, a1, 31
-; RV64M-NEXT:    mulh a4, a2, a4
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    mulh a3, a2, a3
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    slli a4, a4, 2
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    mulh a4, a3, a5
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    slli a5, a4, 3
-; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    slli a3, a3, 2
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    mulh a3, a4, a5
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
+; RV64M-NEXT:    slli a5, a3, 3
+; RV64M-NEXT:    add a3, a4, a3
 ; RV64M-NEXT:    sub a3, a3, a5
 ; RV64M-NEXT:    addi a3, a3, -1
 ; RV64M-NEXT:    seqz a3, a3
@@ -612,22 +612,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    slli a1, a1, 1
 ; RV32MV-NEXT:    sub sp, sp, a1
 ; RV32MV-NEXT:    mv s0, a0
-; RV32MV-NEXT:    lbu a0, 12(a0)
-; RV32MV-NEXT:    lw a1, 8(s0)
-; RV32MV-NEXT:    slli a2, a0, 30
+; RV32MV-NEXT:    lbu a1, 12(a0)
+; RV32MV-NEXT:    lw a2, 8(a0)
+; RV32MV-NEXT:    slli a0, a1, 30
 ; RV32MV-NEXT:    lw a3, 4(s0)
-; RV32MV-NEXT:    srli s1, a1, 2
-; RV32MV-NEXT:    or s1, s1, a2
-; RV32MV-NEXT:    slli a2, a1, 31
-; RV32MV-NEXT:    srli a4, a3, 1
-; RV32MV-NEXT:    or s2, a4, a2
-; RV32MV-NEXT:    srli a0, a0, 2
-; RV32MV-NEXT:    slli a0, a0, 31
-; RV32MV-NEXT:    srai s3, a0, 31
-; RV32MV-NEXT:    srli a1, a1, 1
-; RV32MV-NEXT:    slli a1, a1, 31
+; RV32MV-NEXT:    srli s1, a2, 2
+; RV32MV-NEXT:    or s1, s1, a0
+; RV32MV-NEXT:    slli a4, a2, 31
 ; RV32MV-NEXT:    lw a0, 0(s0)
-; RV32MV-NEXT:    srai s4, a1, 31
+; RV32MV-NEXT:    srli a5, a3, 1
+; RV32MV-NEXT:    or s2, a5, a4
+; RV32MV-NEXT:    srli a1, a1, 2
+; RV32MV-NEXT:    slli a1, a1, 31
+; RV32MV-NEXT:    srai s3, a1, 31
+; RV32MV-NEXT:    srli a2, a2, 1
+; RV32MV-NEXT:    slli a2, a2, 31
+; RV32MV-NEXT:    srai s4, a2, 31
 ; RV32MV-NEXT:    slli a1, a3, 31
 ; RV32MV-NEXT:    srai a1, a1, 31
 ; RV32MV-NEXT:    li a2, 6
@@ -727,46 +727,46 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV:       # %bb.0:
 ; RV64MV-NEXT:    ld a1, 0(a0)
 ; RV64MV-NEXT:    lwu a2, 8(a0)
-; RV64MV-NEXT:    srli a3, a1, 2
-; RV64MV-NEXT:    lbu a4, 12(a0)
+; RV64MV-NEXT:    lbu a3, 12(a0)
+; RV64MV-NEXT:    srli a4, a1, 2
 ; RV64MV-NEXT:    slli a5, a2, 62
-; RV64MV-NEXT:    or a3, a5, a3
-; RV64MV-NEXT:    srai a3, a3, 31
-; RV64MV-NEXT:    slli a4, a4, 32
-; RV64MV-NEXT:    or a2, a2, a4
+; RV64MV-NEXT:    or a4, a5, a4
+; RV64MV-NEXT:    srai a4, a4, 31
+; RV64MV-NEXT:    slli a3, a3, 32
+; RV64MV-NEXT:    or a2, a2, a3
 ; RV64MV-NEXT:    slli a2, a2, 29
-; RV64MV-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64MV-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64MV-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64MV-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64MV-NEXT:    srai a2, a2, 31
 ; RV64MV-NEXT:    slli a1, a1, 31
 ; RV64MV-NEXT:    srai a1, a1, 31
-; RV64MV-NEXT:    mulh a4, a2, a4
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    mulh a3, a2, a3
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    slli a4, a4, 2
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    mulh a4, a3, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    slli a3, a3, 2
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    mulh a3, a4, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    add a3, a3, a4
-; RV64MV-NEXT:    slli a4, a4, 3
-; RV64MV-NEXT:    sub a3, a3, a4
-; RV64MV-NEXT:    mulh a4, a1, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a4, a4, a3
+; RV64MV-NEXT:    slli a3, a3, 3
+; RV64MV-NEXT:    sub a4, a4, a3
+; RV64MV-NEXT:    mulh a3, a1, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    li a5, 6
-; RV64MV-NEXT:    mul a4, a4, a5
-; RV64MV-NEXT:    sub a1, a1, a4
+; RV64MV-NEXT:    mul a3, a3, a5
+; RV64MV-NEXT:    sub a1, a1, a3
 ; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64MV-NEXT:    vmv.v.x v8, a1
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a3
+; RV64MV-NEXT:    vslide1down.vx v8, v8, a4
 ; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64MV-NEXT:    li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 3335ca3a34b6c..091b7d229a06c 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -53,20 +53,20 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
 ; RV32IM-NEXT:    lh a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 507375
 ; RV32IM-NEXT:    addi a5, a5, 1981
 ; RV32IM-NEXT:    mulh a5, a1, a5
@@ -79,26 +79,26 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    lui a5, 342392
 ; RV32IM-NEXT:    addi a5, a5, 669
-; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    mulh a5, a4, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 5
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 780943
 ; RV32IM-NEXT:    addi a5, a5, 1809
-; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    mulh a5, a3, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 8
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_1:
@@ -241,20 +241,20 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
 ; RV32IM-NEXT:    lh a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a6
 ; RV32IM-NEXT:    mulh a6, a1, a5
 ; RV32IM-NEXT:    add a6, a6, a1
 ; RV32IM-NEXT:    srli t0, a6, 31
@@ -262,24 +262,24 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    mulh a6, a3, a5
-; RV32IM-NEXT:    add a6, a6, a3
+; RV32IM-NEXT:    mulh a6, a4, a5
+; RV32IM-NEXT:    add a6, a6, a4
 ; RV32IM-NEXT:    srli t0, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_2:
@@ -445,14 +445,14 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_srem_sdiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 0(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
 ; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srai a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
@@ -464,30 +464,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    srai t1, t1, 6
 ; RV32IM-NEXT:    add t1, t1, t2
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulh t3, a3, a5
-; RV32IM-NEXT:    add t3, t3, a3
+; RV32IM-NEXT:    mulh t3, a4, a5
+; RV32IM-NEXT:    add t3, t3, a4
 ; RV32IM-NEXT:    srli t4, t3, 31
 ; RV32IM-NEXT:    srai t3, t3, 6
 ; RV32IM-NEXT:    add t3, t3, t4
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli t5, a5, 31
 ; RV32IM-NEXT:    srai a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, t5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_srem_sdiv:
@@ -655,36 +655,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 8(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 8(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
 ; RV32IM-NEXT:    lh a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    srli a5, a1, 26
 ; RV32IM-NEXT:    add a5, a1, a5
 ; RV32IM-NEXT:    andi a5, a5, -64
 ; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a3, 27
-; RV32IM-NEXT:    add a5, a3, a5
+; RV32IM-NEXT:    srli a5, a4, 27
+; RV32IM-NEXT:    add a5, a4, a5
 ; RV32IM-NEXT:    andi a5, a5, -32
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    srli a5, a2, 29
-; RV32IM-NEXT:    add a5, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    srli a5, a3, 29
+; RV32IM-NEXT:    add a5, a3, a5
 ; RV32IM-NEXT:    andi a5, a5, -8
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_power_of_two:
@@ -803,19 +803,19 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
 ; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a4, 820904
 ; RV32IM-NEXT:    addi a4, a4, -1903
-; RV32IM-NEXT:    mulh a4, a3, a4
-; RV32IM-NEXT:    add a4, a4, a3
+; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    add a4, a4, a2
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 9
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 729444
 ; RV32IM-NEXT:    addi a4, a4, 713
 ; RV32IM-NEXT:    mulh a4, a1, a4
@@ -828,18 +828,18 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    lui a4, 395996
 ; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    mulh a4, a3, a4
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 11
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a3, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_one:
@@ -1036,34 +1036,34 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lh a2, 16(a1)
 ; RV64IM-NEXT:    lui a3, %hi(.LCPI5_0)
 ; RV64IM-NEXT:    ld a3, %lo(.LCPI5_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
 ; RV64IM-NEXT:    mulh a3, a2, a3
 ; RV64IM-NEXT:    add a3, a3, a2
 ; RV64IM-NEXT:    srli a5, a3, 63
 ; RV64IM-NEXT:    srli a3, a3, 4
 ; RV64IM-NEXT:    add a3, a3, a5
-; RV64IM-NEXT:    li a5, 23
-; RV64IM-NEXT:    lui a6, %hi(.LCPI5_1)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI5_1)(a6)
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    lh a1, 8(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
+; RV64IM-NEXT:    li a6, 23
+; RV64IM-NEXT:    mul a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a4, a6
+; RV64IM-NEXT:    mulh a3, a1, a5
 ; RV64IM-NEXT:    srli a5, a3, 63
 ; RV64IM-NEXT:    srli a3, a3, 11
 ; RV64IM-NEXT:    add a3, a3, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
 ; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    srli a3, a1, 49
-; RV64IM-NEXT:    add a3, a1, a3
+; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    srli a3, a4, 49
+; RV64IM-NEXT:    add a3, a4, a3
 ; RV64IM-NEXT:    lui a5, 8
 ; RV64IM-NEXT:    and a3, a3, a5
-; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    subw a4, a4, a3
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a2, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
@@ -1085,13 +1085,13 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw s0, 28(a1)
+; RV32I-NEXT:    lw s1, 24(a1)
+; RV32I-NEXT:    lw s2, 20(a1)
+; RV32I-NEXT:    lw s3, 16(a1)
+; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
@@ -1101,23 +1101,23 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s5
+; RV32I-NEXT:    mv a1, s4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
@@ -1154,13 +1154,13 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
+; RV32IM-NEXT:    lw s0, 28(a1)
+; RV32IM-NEXT:    lw s1, 24(a1)
+; RV32IM-NEXT:    lw s2, 20(a1)
+; RV32IM-NEXT:    lw s3, 16(a1)
+; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
@@ -1170,23 +1170,23 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s5
+; RV32IM-NEXT:    mv a1, s4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s4, a0
 ; RV32IM-NEXT:    mv s5, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s2
-; RV32IM-NEXT:    mv a1, s3
+; RV32IM-NEXT:    mv a0, s3
+; RV32IM-NEXT:    mv a1, s2
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s1
+; RV32IM-NEXT:    mv a1, s0
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 8c0d97afe6c21..1525804be545c 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,47 +10,47 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 12(a1)
-; RISCV32-NEXT:    lw a7, 12(a2)
-; RISCV32-NEXT:    lw a6, 8(a1)
-; RISCV32-NEXT:    lw a4, 0(a2)
-; RISCV32-NEXT:    lw a5, 0(a1)
+; RISCV32-NEXT:    lw a3, 0(a2)
+; RISCV32-NEXT:    lw a4, 12(a1)
+; RISCV32-NEXT:    lw a5, 8(a1)
 ; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw t0, 8(a2)
-; RISCV32-NEXT:    lw a2, 4(a2)
-; RISCV32-NEXT:    mulhu a1, a5, a4
-; RISCV32-NEXT:    mul t1, t2, a4
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, t2, a4
+; RISCV32-NEXT:    lw a1, 0(a1)
+; RISCV32-NEXT:    lw a6, 12(a2)
+; RISCV32-NEXT:    lw a7, 8(a2)
+; RISCV32-NEXT:    lw t0, 4(a2)
+; RISCV32-NEXT:    mulhu a2, a1, a3
+; RISCV32-NEXT:    mul t1, t2, a3
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, t2, a3
 ; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a5, a2
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, a5, a2
+; RISCV32-NEXT:    mul t1, a1, t0
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, a1, t0
 ; RISCV32-NEXT:    add t1, t3, t1
 ; RISCV32-NEXT:    add t5, t4, t1
-; RISCV32-NEXT:    mul t6, t2, a2
+; RISCV32-NEXT:    mul t6, t2, t0
 ; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, t0, a5
-; RISCV32-NEXT:    mul s3, a6, a4
+; RISCV32-NEXT:    mul t1, a7, a1
+; RISCV32-NEXT:    mul s3, a5, a3
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t3, t1, s0
 ; RISCV32-NEXT:    sltu s0, s0, t6
 ; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, t2, a2
+; RISCV32-NEXT:    mulhu t5, t2, t0
 ; RISCV32-NEXT:    add t4, t5, t4
 ; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, t0
-; RISCV32-NEXT:    mul t5, a7, a5
+; RISCV32-NEXT:    mul t4, t2, a7
+; RISCV32-NEXT:    mul t5, a6, a1
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, t0, a5
+; RISCV32-NEXT:    mulhu s1, a7, a1
 ; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, a2, a6
-; RISCV32-NEXT:    mul t5, a3, a4
+; RISCV32-NEXT:    mul t4, t0, a5
+; RISCV32-NEXT:    mul t5, a4, a3
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a6, a4
+; RISCV32-NEXT:    mulhu t5, a5, a3
 ; RISCV32-NEXT:    add t6, t5, t4
 ; RISCV32-NEXT:    add t4, t6, s2
 ; RISCV32-NEXT:    sltu s3, s4, s3
@@ -63,41 +63,41 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:  .LBB0_2: # %start
 ; RISCV32-NEXT:    sltu s0, s2, s1
 ; RISCV32-NEXT:    snez s1, t2
-; RISCV32-NEXT:    snez s2, a7
+; RISCV32-NEXT:    snez s2, a6
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a7, a5
+; RISCV32-NEXT:    mulhu s2, a6, a1
 ; RISCV32-NEXT:    snez s2, s2
 ; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, t0
+; RISCV32-NEXT:    mulhu t2, t2, a7
 ; RISCV32-NEXT:    snez t2, t2
 ; RISCV32-NEXT:    or t2, s1, t2
 ; RISCV32-NEXT:    or t2, t2, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    snez t6, a2
-; RISCV32-NEXT:    snez s0, a3
+; RISCV32-NEXT:    snez t6, t0
+; RISCV32-NEXT:    snez s0, a4
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a3, a4
+; RISCV32-NEXT:    mulhu s0, a4, a3
 ; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu a2, a2, a6
-; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    or a2, t6, a2
-; RISCV32-NEXT:    or a2, a2, t5
-; RISCV32-NEXT:    or a7, t0, a7
-; RISCV32-NEXT:    snez a7, a7
-; RISCV32-NEXT:    or a3, a6, a3
-; RISCV32-NEXT:    snez a3, a3
-; RISCV32-NEXT:    and a3, a3, a7
-; RISCV32-NEXT:    or a2, a3, a2
-; RISCV32-NEXT:    or a3, t2, t3
-; RISCV32-NEXT:    or a2, a2, a3
-; RISCV32-NEXT:    mul a3, a5, a4
-; RISCV32-NEXT:    andi a2, a2, 1
-; RISCV32-NEXT:    sw a3, 0(a0)
-; RISCV32-NEXT:    sw a1, 4(a0)
+; RISCV32-NEXT:    mulhu t0, t0, a5
+; RISCV32-NEXT:    snez t0, t0
+; RISCV32-NEXT:    or t0, t6, t0
+; RISCV32-NEXT:    or t0, t0, t5
+; RISCV32-NEXT:    or a6, a7, a6
+; RISCV32-NEXT:    snez a6, a6
+; RISCV32-NEXT:    or a4, a5, a4
+; RISCV32-NEXT:    snez a4, a4
+; RISCV32-NEXT:    and a4, a4, a6
+; RISCV32-NEXT:    or a4, a4, t0
+; RISCV32-NEXT:    or a5, t2, t3
+; RISCV32-NEXT:    or a4, a4, a5
+; RISCV32-NEXT:    mul a1, a1, a3
+; RISCV32-NEXT:    andi a4, a4, 1
+; RISCV32-NEXT:    sw a1, 0(a0)
+; RISCV32-NEXT:    sw a2, 4(a0)
 ; RISCV32-NEXT:    sw t1, 8(a0)
 ; RISCV32-NEXT:    sw t4, 12(a0)
-; RISCV32-NEXT:    sb a2, 16(a0)
+; RISCV32-NEXT:    sb a4, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index ce0d8fedbfb88..9cc835da3cc29 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -85,50 +85,49 @@ define i32 @load_i32(ptr %p) {
 define i64 @load_i64(ptr %p) {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 0(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a2, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu a4, 3(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
+; RV32I-NEXT:    slli a2, a2, 8
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a2, a4, a3
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    or a0, a4, a3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a1, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a2, t0, a7
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: load_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 0(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a2, 1(a0)
 ; RV64I-NEXT:    lbu a3, 2(a0)
 ; RV64I-NEXT:    lbu a4, 3(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a2, a2, 8
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    lbu a2, 5(a0)
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a2, a2, 8
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a2, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 32aca29d16e9b..8fc4465ffab1f 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -54,16 +54,16 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 8456
 ; RV32IM-NEXT:    addi a5, a5, 1058
 ; RV32IM-NEXT:    mulhu a5, a1, a5
@@ -72,20 +72,20 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    lui a5, 10700
 ; RV32IM-NEXT:    addi a5, a5, -1003
-; RV32IM-NEXT:    mulhu a5, a3, a5
+; RV32IM-NEXT:    mulhu a5, a4, a5
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1045
 ; RV32IM-NEXT:    addi a5, a5, 1801
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    li a6, 1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_1:
@@ -214,29 +214,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a6
 ; RV32IM-NEXT:    mulhu a6, a1, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    mulhu a6, a3, a5
+; RV32IM-NEXT:    mulhu a6, a4, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_2:
@@ -386,33 +386,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_urem_udiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 0(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 0(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul t0, a6, a7
 ; RV32IM-NEXT:    mulhu t1, a1, a5
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulhu t3, a3, a5
+; RV32IM-NEXT:    mulhu t3, a4, a5
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_urem_udiv:
@@ -531,10 +531,10 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu s3, 0(a1)
-; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
@@ -556,23 +556,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 8(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 8(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a3, a3, 31
-; RV32IM-NEXT:    andi a2, a2, 7
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    andi a4, a4, 31
+; RV32IM-NEXT:    andi a3, a3, 7
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_power_of_two:
@@ -583,10 +583,10 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu s3, 0(a1)
-; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
@@ -670,15 +670,15 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a2, 4(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a4, 1603
 ; RV32IM-NEXT:    addi a4, a4, 1341
-; RV32IM-NEXT:    mulhu a4, a3, a4
+; RV32IM-NEXT:    mulhu a4, a2, a4
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 45590
 ; RV32IM-NEXT:    addi a4, a4, 1069
 ; RV32IM-NEXT:    mulhu a4, a1, a4
@@ -687,15 +687,15 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    lui a4, 193
 ; RV32IM-NEXT:    addi a4, a4, 1464
-; RV32IM-NEXT:    mulhu a4, a2, a4
+; RV32IM-NEXT:    mulhu a4, a3, a4
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a3, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_one:
@@ -791,13 +791,13 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw s0, 28(a1)
+; RV32I-NEXT:    lw s1, 24(a1)
+; RV32I-NEXT:    lw s2, 20(a1)
+; RV32I-NEXT:    lw s3, 16(a1)
+; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
@@ -807,23 +807,23 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s5
+; RV32I-NEXT:    mv a1, s4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
@@ -860,13 +860,13 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
+; RV32IM-NEXT:    lw s0, 28(a1)
+; RV32IM-NEXT:    lw s1, 24(a1)
+; RV32IM-NEXT:    lw s2, 20(a1)
+; RV32IM-NEXT:    lw s3, 16(a1)
+; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
@@ -876,23 +876,23 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s5
+; RV32IM-NEXT:    mv a1, s4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s4, a0
 ; RV32IM-NEXT:    mv s5, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s2
-; RV32IM-NEXT:    mv a1, s3
+; RV32IM-NEXT:    mv a0, s3
+; RV32IM-NEXT:    mv a1, s2
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s1
+; RV32IM-NEXT:    mv a1, s0
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..ed5a522a8a746 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -38,17 +38,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -102,17 +102,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -136,8 +136,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -166,17 +166,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -198,47 +198,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -272,17 +272,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
@@ -334,47 +334,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -408,17 +408,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
@@ -470,47 +470,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -544,18 +544,18 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 3
+; RV32I-NEXT:    or a5, a1, a6
+; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -607,47 +607,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
 ; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -659,25 +659,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -779,38 +779,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -826,47 +826,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
 ; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -878,25 +878,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -998,38 +998,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 20
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1045,47 +1045,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 5(a1)
 ; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 6(a1)
 ; RV64I-NEXT:    lbu t0, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    lbu t1, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a5, a5, 35
@@ -1099,25 +1099,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1167,94 +1167,94 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu a0, 14(a0)
+; RV32I-NEXT:    slli s3, a3, 24
 ; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    sb a0, 14(sp)
+; RV32I-NEXT:    sb s2, 13(sp)
+; RV32I-NEXT:    sb s1, 12(sp)
+; RV32I-NEXT:    sb s0, 11(sp)
+; RV32I-NEXT:    sb t6, 10(sp)
+; RV32I-NEXT:    sb t5, 9(sp)
+; RV32I-NEXT:    sb t4, 8(sp)
+; RV32I-NEXT:    sb t3, 7(sp)
+; RV32I-NEXT:    sb t2, 6(sp)
+; RV32I-NEXT:    sb t1, 5(sp)
+; RV32I-NEXT:    sb t0, 4(sp)
+; RV32I-NEXT:    sb a7, 3(sp)
+; RV32I-NEXT:    sb a6, 2(sp)
+; RV32I-NEXT:    sb a5, 1(sp)
+; RV32I-NEXT:    sb a4, 0(sp)
+; RV32I-NEXT:    srai a0, s3, 31
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    sb a0, 24(sp)
+; RV32I-NEXT:    sb a0, 20(sp)
+; RV32I-NEXT:    sb a0, 16(sp)
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a0, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a0, 21(sp)
+; RV32I-NEXT:    sb a3, 19(sp)
+; RV32I-NEXT:    sb a4, 18(sp)
+; RV32I-NEXT:    sb a0, 17(sp)
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1286,18 +1286,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1318,19 +1321,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 87(sp)
+; RV64I-NEXT:    sb a7, 86(sp)
 ; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
+; RV64I-NEXT:    sb a1, 84(sp)
+; RV64I-NEXT:    sb a3, 83(sp)
+; RV64I-NEXT:    sb a4, 82(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1395,83 +1396,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    add a0, a0, a5
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1504,18 +1505,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1536,19 +1540,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 59(sp)
+; RV32I-NEXT:    sb a7, 58(sp)
 ; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
+; RV32I-NEXT:    sb a1, 56(sp)
+; RV32I-NEXT:    sb a3, 55(sp)
+; RV32I-NEXT:    sb a4, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1613,83 +1615,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    andi a0, a5, 31
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -1729,18 +1731,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1761,19 +1766,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 119(sp)
+; RV64I-NEXT:    sb a7, 118(sp)
 ; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 116(sp)
+; RV64I-NEXT:    sb a3, 115(sp)
+; RV64I-NEXT:    sb a4, 114(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -1838,83 +1841,83 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    sub a0, a0, a5
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1947,18 +1950,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1979,19 +1985,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 91(sp)
+; RV32I-NEXT:    sb a7, 90(sp)
 ; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
+; RV32I-NEXT:    sb a1, 88(sp)
+; RV32I-NEXT:    sb a3, 87(sp)
+; RV32I-NEXT:    sb a4, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2056,83 +2060,83 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 61(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    andi a5, a5, 31
 ; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sub a0, a0, a5
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -2172,86 +2176,87 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    mv t3, a1
+; RV64I-NEXT:    lbu t2, 29(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
 ; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    lbu a1, 2(a0)
 ; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    lbu a1, 3(a0)
 ; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    lbu a1, 4(a0)
 ; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    lbu t4, 8(a0)
-; RV64I-NEXT:    lbu t5, 9(a0)
-; RV64I-NEXT:    lbu t6, 10(a0)
-; RV64I-NEXT:    lbu s0, 11(a0)
-; RV64I-NEXT:    lbu s1, 12(a0)
-; RV64I-NEXT:    lbu s2, 13(a0)
-; RV64I-NEXT:    lbu s3, 14(a0)
-; RV64I-NEXT:    lbu s4, 15(a0)
-; RV64I-NEXT:    lbu s5, 16(a0)
-; RV64I-NEXT:    lbu s6, 17(a0)
-; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 19(a0)
-; RV64I-NEXT:    lbu s9, 20(a0)
-; RV64I-NEXT:    lbu s10, 21(a0)
-; RV64I-NEXT:    lbu s11, 22(a0)
-; RV64I-NEXT:    lbu ra, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu t5, 7(a0)
+; RV64I-NEXT:    lbu t6, 8(a0)
+; RV64I-NEXT:    lbu s0, 9(a0)
+; RV64I-NEXT:    lbu s1, 10(a0)
+; RV64I-NEXT:    lbu s2, 11(a0)
+; RV64I-NEXT:    lbu s3, 12(a0)
+; RV64I-NEXT:    lbu s4, 13(a0)
+; RV64I-NEXT:    lbu s5, 14(a0)
+; RV64I-NEXT:    lbu s6, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu a6, 23(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
+; RV64I-NEXT:    lbu t3, 0(t3)
+; RV64I-NEXT:    sd t3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sb t1, 86(sp)
+; RV64I-NEXT:    sb t2, 85(sp)
 ; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a4, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb ra, 79(sp)
-; RV64I-NEXT:    sb s11, 78(sp)
-; RV64I-NEXT:    sb s10, 77(sp)
-; RV64I-NEXT:    sb s9, 76(sp)
-; RV64I-NEXT:    sb s8, 75(sp)
-; RV64I-NEXT:    sb s7, 74(sp)
-; RV64I-NEXT:    sb s6, 73(sp)
-; RV64I-NEXT:    sb s5, 72(sp)
-; RV64I-NEXT:    sb s4, 71(sp)
-; RV64I-NEXT:    sb s3, 70(sp)
-; RV64I-NEXT:    sb s2, 69(sp)
-; RV64I-NEXT:    sb s1, 68(sp)
-; RV64I-NEXT:    sb s0, 67(sp)
-; RV64I-NEXT:    sb t6, 66(sp)
-; RV64I-NEXT:    sb t5, 65(sp)
-; RV64I-NEXT:    sb t4, 64(sp)
-; RV64I-NEXT:    sb t3, 63(sp)
-; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 83(sp)
+; RV64I-NEXT:    sb a3, 82(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
+; RV64I-NEXT:    sb t0, 87(sp)
+; RV64I-NEXT:    slli t0, t0, 56
+; RV64I-NEXT:    sb a5, 80(sp)
+; RV64I-NEXT:    sb a6, 79(sp)
+; RV64I-NEXT:    sb a7, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s10, 75(sp)
+; RV64I-NEXT:    sb s9, 74(sp)
+; RV64I-NEXT:    sb s8, 73(sp)
+; RV64I-NEXT:    sb s7, 72(sp)
+; RV64I-NEXT:    sb s6, 71(sp)
+; RV64I-NEXT:    sb s5, 70(sp)
+; RV64I-NEXT:    sb s4, 69(sp)
+; RV64I-NEXT:    sb s3, 68(sp)
+; RV64I-NEXT:    sb s2, 67(sp)
+; RV64I-NEXT:    sb s1, 66(sp)
+; RV64I-NEXT:    sb s0, 65(sp)
+; RV64I-NEXT:    sb t6, 64(sp)
+; RV64I-NEXT:    sb t5, 63(sp)
+; RV64I-NEXT:    sb t4, 62(sp)
+; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    srai a0, t0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2291,83 +2296,84 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    andi a0, a0, 31
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -2400,86 +2406,87 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
+; RV32I-NEXT:    mv t3, a1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t0, 31(a0)
+; RV32I-NEXT:    lbu t1, 30(a0)
 ; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
 ; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a1, 3(a0)
 ; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t3, 7(a0)
-; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s6, 17(a0)
-; RV32I-NEXT:    lbu s7, 18(a0)
-; RV32I-NEXT:    lbu s8, 19(a0)
-; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 21(a0)
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t6, 8(a0)
+; RV32I-NEXT:    lbu s0, 9(a0)
+; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s2, 11(a0)
+; RV32I-NEXT:    lbu s3, 12(a0)
+; RV32I-NEXT:    lbu s4, 13(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s7, 16(a0)
+; RV32I-NEXT:    lbu s8, 17(a0)
+; RV32I-NEXT:    lbu s9, 18(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    lbu ra, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu a6, 23(a0)
+; RV32I-NEXT:    lbu a5, 24(a0)
+; RV32I-NEXT:    lbu a4, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
+; RV32I-NEXT:    lbu t3, 0(t3)
+; RV32I-NEXT:    sw t3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sb t1, 58(sp)
+; RV32I-NEXT:    sb t2, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a4, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s9, 48(sp)
-; RV32I-NEXT:    sb s8, 47(sp)
-; RV32I-NEXT:    sb s7, 46(sp)
-; RV32I-NEXT:    sb s6, 45(sp)
-; RV32I-NEXT:    sb s5, 44(sp)
-; RV32I-NEXT:    sb s4, 43(sp)
-; RV32I-NEXT:    sb s3, 42(sp)
-; RV32I-NEXT:    sb s2, 41(sp)
-; RV32I-NEXT:    sb s1, 40(sp)
-; RV32I-NEXT:    sb s0, 39(sp)
-; RV32I-NEXT:    sb t6, 38(sp)
-; RV32I-NEXT:    sb t5, 37(sp)
-; RV32I-NEXT:    sb t4, 36(sp)
-; RV32I-NEXT:    sb t3, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 55(sp)
+; RV32I-NEXT:    sb a3, 54(sp)
+; RV32I-NEXT:    sb a4, 53(sp)
+; RV32I-NEXT:    sb t0, 59(sp)
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    sb a5, 52(sp)
+; RV32I-NEXT:    sb a6, 51(sp)
+; RV32I-NEXT:    sb a7, 50(sp)
+; RV32I-NEXT:    sb ra, 49(sp)
+; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s10, 47(sp)
+; RV32I-NEXT:    sb s9, 46(sp)
+; RV32I-NEXT:    sb s8, 45(sp)
+; RV32I-NEXT:    sb s7, 44(sp)
+; RV32I-NEXT:    sb s6, 43(sp)
+; RV32I-NEXT:    sb s5, 42(sp)
+; RV32I-NEXT:    sb s4, 41(sp)
+; RV32I-NEXT:    sb s3, 40(sp)
+; RV32I-NEXT:    sb s2, 39(sp)
+; RV32I-NEXT:    sb s1, 38(sp)
+; RV32I-NEXT:    sb s0, 37(sp)
+; RV32I-NEXT:    sb t6, 36(sp)
+; RV32I-NEXT:    sb t5, 35(sp)
+; RV32I-NEXT:    sb t4, 34(sp)
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
+; RV32I-NEXT:    srai a0, t0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -2515,83 +2522,84 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi a0, a0, 31
 ; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afa..c80c3e6834f67 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -37,17 +37,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -69,8 +69,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -98,17 +98,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -130,8 +130,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -159,17 +159,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -189,47 +189,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -262,17 +262,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB3_2
@@ -322,47 +322,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -395,17 +395,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB4_2
@@ -455,47 +455,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -528,17 +528,17 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a5, a1, a6
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -589,47 +589,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -640,25 +640,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -710,36 +710,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 39(sp)
 ; RV32I-NEXT:    sb zero, 38(sp)
 ; RV32I-NEXT:    sb zero, 37(sp)
@@ -752,115 +749,120 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb s5, 23(sp)
+; RV32I-NEXT:    sb s4, 22(sp)
+; RV32I-NEXT:    sb s3, 21(sp)
+; RV32I-NEXT:    sb s2, 20(sp)
+; RV32I-NEXT:    sb s1, 19(sp)
+; RV32I-NEXT:    sb s0, 18(sp)
+; RV32I-NEXT:    sb t6, 17(sp)
+; RV32I-NEXT:    sb t5, 16(sp)
+; RV32I-NEXT:    sb t4, 15(sp)
+; RV32I-NEXT:    sb t3, 14(sp)
+; RV32I-NEXT:    sb t2, 13(sp)
+; RV32I-NEXT:    sb t1, 12(sp)
+; RV32I-NEXT:    sb t0, 11(sp)
+; RV32I-NEXT:    sb a7, 10(sp)
+; RV32I-NEXT:    sb a6, 9(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 8
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 4(a1)
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 6(a1)
+; RV32I-NEXT:    lbu a6, 7(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 9(a1)
+; RV32I-NEXT:    lbu t1, 10(a1)
+; RV32I-NEXT:    lbu t2, 11(a1)
+; RV32I-NEXT:    lbu t3, 12(a1)
+; RV32I-NEXT:    lbu t4, 13(a1)
+; RV32I-NEXT:    lbu t5, 14(a1)
+; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    lbu s1, 1(a1)
+; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl a4, a3, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    not a7, a0
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    srl a1, a1, a0
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    sll a3, a3, a7
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    sll a7, t1, a7
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    srl a0, t0, a0
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli t0, a5, 16
+; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a5, 14(a2)
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    sb a5, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    srli a0, a7, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -872,47 +874,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -923,25 +925,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -993,36 +995,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 23(sp)
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
@@ -1035,115 +1034,120 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
 ; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
+; RV32I-NEXT:    sb zero, 11(sp)
+; RV32I-NEXT:    sb zero, 10(sp)
+; RV32I-NEXT:    sb zero, 9(sp)
+; RV32I-NEXT:    sb zero, 8(sp)
+; RV32I-NEXT:    sb s5, 39(sp)
+; RV32I-NEXT:    sb s4, 38(sp)
+; RV32I-NEXT:    sb s3, 37(sp)
+; RV32I-NEXT:    sb s2, 36(sp)
+; RV32I-NEXT:    sb s1, 35(sp)
+; RV32I-NEXT:    sb s0, 34(sp)
+; RV32I-NEXT:    sb t6, 33(sp)
+; RV32I-NEXT:    sb t5, 32(sp)
+; RV32I-NEXT:    sb t4, 31(sp)
+; RV32I-NEXT:    sb t3, 30(sp)
+; RV32I-NEXT:    sb t2, 29(sp)
+; RV32I-NEXT:    sb t1, 28(sp)
+; RV32I-NEXT:    sb t0, 27(sp)
+; RV32I-NEXT:    sb a7, 26(sp)
+; RV32I-NEXT:    sb a6, 25(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 24
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 13(a3)
+; RV32I-NEXT:    lbu a4, 14(a3)
+; RV32I-NEXT:    lbu a5, 15(a3)
+; RV32I-NEXT:    lbu a6, 4(a3)
+; RV32I-NEXT:    lbu a7, 5(a3)
+; RV32I-NEXT:    lbu t0, 6(a3)
+; RV32I-NEXT:    lbu t1, 7(a3)
 ; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    lbu t3, 9(a3)
+; RV32I-NEXT:    lbu t4, 10(a3)
+; RV32I-NEXT:    lbu t5, 12(a3)
+; RV32I-NEXT:    lbu t6, 11(a3)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    lbu s0, 0(a3)
+; RV32I-NEXT:    lbu s1, 1(a3)
+; RV32I-NEXT:    lbu s2, 2(a3)
+; RV32I-NEXT:    lbu a3, 3(a3)
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    sll a7, a6, a0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    srli t0, a3, 1
+; RV32I-NEXT:    xori t1, a0, 31
+; RV32I-NEXT:    srl t0, t0, t1
+; RV32I-NEXT:    or t0, a7, t0
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    sll a1, a1, a0
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or a4, t3, t2
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t6, t4
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    srli a5, a4, 1
 ; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    sll a4, a4, a0
+; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    not t1, a0
+; RV32I-NEXT:    srl a6, a6, t1
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    sll a0, a3, a0
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    sb a3, 10(a2)
 ; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a3, 11(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, a7, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
-; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1155,47 +1159,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
 ; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
 ; RV64I-NEXT:    lbu t0, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a5
@@ -1208,25 +1212,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1277,157 +1281,157 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 0(a1)
+; RV32I-NEXT:    lbu s3, 1(a1)
+; RV32I-NEXT:    lbu s4, 15(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu a0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    slli s3, s4, 24
+; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    or a0, a0, s2
+; RV32I-NEXT:    sb s4, 23(sp)
+; RV32I-NEXT:    sb s5, 22(sp)
+; RV32I-NEXT:    sb s1, 21(sp)
+; RV32I-NEXT:    sb s0, 20(sp)
+; RV32I-NEXT:    sb t6, 19(sp)
+; RV32I-NEXT:    sb t5, 18(sp)
+; RV32I-NEXT:    sb t4, 17(sp)
+; RV32I-NEXT:    sb t3, 16(sp)
+; RV32I-NEXT:    sb t2, 15(sp)
+; RV32I-NEXT:    sb t1, 14(sp)
+; RV32I-NEXT:    sb t0, 13(sp)
+; RV32I-NEXT:    sb a7, 12(sp)
+; RV32I-NEXT:    sb a6, 11(sp)
+; RV32I-NEXT:    sb a5, 10(sp)
+; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    sb a3, 8(sp)
+; RV32I-NEXT:    srai a1, s3, 31
+; RV32I-NEXT:    sb a1, 36(sp)
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    sb a1, 24(sp)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 39(sp)
+; RV32I-NEXT:    srli a4, a1, 16
+; RV32I-NEXT:    sb a4, 38(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 37(sp)
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    sb a1, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a1, 25(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 4(a1)
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 6(a1)
+; RV32I-NEXT:    lbu a6, 7(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 9(a1)
+; RV32I-NEXT:    lbu t1, 10(a1)
+; RV32I-NEXT:    lbu t2, 11(a1)
+; RV32I-NEXT:    lbu t3, 12(a1)
+; RV32I-NEXT:    lbu t4, 13(a1)
+; RV32I-NEXT:    lbu t5, 14(a1)
+; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    lbu s1, 1(a1)
+; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl a4, a3, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    not a7, a0
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    srl a1, a1, a0
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    sll a3, a3, a7
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    sll a7, t1, a7
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    sra a0, t0, a0
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli t0, a5, 16
+; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a5, 14(a2)
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    sb a5, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    srli a0, a7, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
@@ -1460,18 +1464,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1480,69 +1509,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
+; RV64I-NEXT:    sb s6, 87(sp)
+; RV64I-NEXT:    sb s4, 86(sp)
+; RV64I-NEXT:    sb s2, 85(sp)
 ; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
+; RV64I-NEXT:    sb a1, 83(sp)
+; RV64I-NEXT:    sb a3, 82(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1575,6 +1562,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 90(sp)
 ; RV64I-NEXT:    sb zero, 89(sp)
 ; RV64I-NEXT:    sb zero, 88(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
+; RV64I-NEXT:    sb a6, 80(sp)
+; RV64I-NEXT:    sb a7, 79(sp)
+; RV64I-NEXT:    sb t0, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s10, 75(sp)
+; RV64I-NEXT:    sb s9, 74(sp)
+; RV64I-NEXT:    sb s8, 73(sp)
+; RV64I-NEXT:    sb s7, 72(sp)
+; RV64I-NEXT:    sb s5, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s1, 69(sp)
+; RV64I-NEXT:    sb s0, 68(sp)
+; RV64I-NEXT:    sb t6, 67(sp)
+; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
 ; RV64I-NEXT:    sb t3, 64(sp)
 ; RV64I-NEXT:    sb t2, 63(sp)
 ; RV64I-NEXT:    sb t1, 62(sp)
@@ -1590,111 +1594,112 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, a5, 56
+; RV64I-NEXT:    mv ra, a5
 ; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    addi a1, sp, 56
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu s1, 21(a0)
+; RV64I-NEXT:    lbu a1, 22(a0)
+; RV64I-NEXT:    lbu s2, 23(a0)
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s4, 25(a0)
+; RV64I-NEXT:    lbu s5, 26(a0)
+; RV64I-NEXT:    lbu s6, 27(a0)
+; RV64I-NEXT:    lbu s7, 28(a0)
+; RV64I-NEXT:    lbu s8, 29(a0)
+; RV64I-NEXT:    lbu s9, 30(a0)
+; RV64I-NEXT:    lbu s10, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    lbu s11, 7(a0)
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    andi a3, ra, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a5, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    slli a1, a1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or a1, s2, a1
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    slli a1, a5, 1
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or a0, s11, a0
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or a7, s6, s5
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or a7, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or t0, s10, s9
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
-; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
+; RV64I-NEXT:    srl a4, a6, a3
+; RV64I-NEXT:    srl a5, a5, a3
+; RV64I-NEXT:    srl a3, a7, a3
 ; RV64I-NEXT:    srli a6, a5, 48
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    srli a6, a5, 40
@@ -1709,55 +1714,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    srli a5, a3, 56
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    srli a5, a3, 40
 ; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    srli a5, a3, 32
 ; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    srli a5, a3, 24
 ; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    srli a5, a3, 16
 ; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 25(a2)
-; RV64I-NEXT:    srli a1, a4, 48
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a4, 40
-; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a4, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a4, 24
-; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a4, 16
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    or a3, a4, t1
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 7(a2)
 ; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1790,18 +1795,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1816,44 +1835,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
+; RV32I-NEXT:    sb a4, 59(sp)
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb ra, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
+; RV32I-NEXT:    sb a1, 55(sp)
+; RV32I-NEXT:    sb a3, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1886,6 +1882,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 62(sp)
 ; RV32I-NEXT:    sb zero, 61(sp)
 ; RV32I-NEXT:    sb zero, 60(sp)
+; RV32I-NEXT:    sb a5, 53(sp)
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb s11, 51(sp)
+; RV32I-NEXT:    sb s10, 50(sp)
+; RV32I-NEXT:    sb s9, 49(sp)
+; RV32I-NEXT:    sb s8, 48(sp)
+; RV32I-NEXT:    sb s7, 47(sp)
+; RV32I-NEXT:    sb s6, 46(sp)
+; RV32I-NEXT:    sb s5, 45(sp)
+; RV32I-NEXT:    sb s4, 44(sp)
 ; RV32I-NEXT:    sb s3, 43(sp)
 ; RV32I-NEXT:    sb s2, 42(sp)
 ; RV32I-NEXT:    sb s1, 41(sp)
@@ -1896,187 +1902,194 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 36(sp)
 ; RV32I-NEXT:    sb t2, 35(sp)
 ; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu t0, 20(a0)
+; RV32I-NEXT:    lbu s5, 21(a0)
+; RV32I-NEXT:    lbu s6, 22(a0)
+; RV32I-NEXT:    lbu s7, 23(a0)
+; RV32I-NEXT:    lbu s4, 24(a0)
+; RV32I-NEXT:    lbu s8, 25(a0)
+; RV32I-NEXT:    lbu s9, 26(a0)
+; RV32I-NEXT:    lbu s10, 27(a0)
+; RV32I-NEXT:    lbu s11, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    lw a7, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t1, 1(a0)
+; RV32I-NEXT:    lbu t2, 0(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    or a7, a7, a1
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a1, t1, t2
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or a3, t6, t5
+; RV32I-NEXT:    or t1, a3, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or a0, s5, t0
+; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t2, a3, 7
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a3, s7, s6
+; RV32I-NEXT:    slli t0, a7, 1
+; RV32I-NEXT:    or t3, a3, a0
+; RV32I-NEXT:    not t4, t2
+; RV32I-NEXT:    sll a0, t0, t4
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or t0, s8, s4
+; RV32I-NEXT:    slli t5, a4, 1
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or t6, s10, s9
+; RV32I-NEXT:    slli a3, s0, 1
+; RV32I-NEXT:    sll a3, a3, t4
+; RV32I-NEXT:    or t6, t6, t0
+; RV32I-NEXT:    slli t0, t6, 1
+; RV32I-NEXT:    sll t4, t0, t4
+; RV32I-NEXT:    xori s1, t2, 31
+; RV32I-NEXT:    sll t0, t5, s1
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
+; RV32I-NEXT:    or a5, a5, s11
+; RV32I-NEXT:    slli t5, t1, 1
+; RV32I-NEXT:    sll t5, t5, s1
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a6, a6, ra
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    sll s2, s2, s1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    sll a6, a6, s1
+; RV32I-NEXT:    srl a4, a4, t2
+; RV32I-NEXT:    srl a1, a1, t2
+; RV32I-NEXT:    srl t1, t1, t2
+; RV32I-NEXT:    srl a7, a7, t2
+; RV32I-NEXT:    srl t3, t3, t2
+; RV32I-NEXT:    srl s0, s0, t2
+; RV32I-NEXT:    srl t6, t6, t2
+; RV32I-NEXT:    srl a5, a5, t2
+; RV32I-NEXT:    srli t2, t6, 16
+; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    or a6, t6, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    srli t2, t6, 8
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    sb t2, 31(a2)
+; RV32I-NEXT:    srli t2, a5, 16
+; RV32I-NEXT:    sb t2, 30(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a5, s0, 16
+; RV32I-NEXT:    sb a5, 18(a2)
+; RV32I-NEXT:    or a5, s0, s2
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli t2, t3, 16
+; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    or t2, t3, t4
 ; RV32I-NEXT:    sb t3, 20(a2)
 ; RV32I-NEXT:    srli t3, t3, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, a7, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
+; RV32I-NEXT:    or t3, a7, t5
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 14(a2)
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    srli a7, t1, 8
 ; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
+; RV32I-NEXT:    srli a7, a1, 16
 ; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or a7, a1, t0
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    srli a1, t2, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
 ; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    srli a1, a7, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
@@ -2118,18 +2131,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or s11, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -2138,70 +2176,28 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu a6, 23(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    sb s6, 119(sp)
+; RV64I-NEXT:    sb s4, 118(sp)
+; RV64I-NEXT:    sb s2, 117(sp)
 ; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
+; RV64I-NEXT:    sb a1, 115(sp)
+; RV64I-NEXT:    sb a3, 114(sp)
+; RV64I-NEXT:    sb a4, 113(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -2234,6 +2230,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 58(sp)
 ; RV64I-NEXT:    sb zero, 57(sp)
 ; RV64I-NEXT:    sb zero, 56(sp)
+; RV64I-NEXT:    sb a5, 112(sp)
+; RV64I-NEXT:    sb a6, 111(sp)
+; RV64I-NEXT:    sb a7, 110(sp)
+; RV64I-NEXT:    sb ra, 109(sp)
+; RV64I-NEXT:    sb t0, 108(sp)
+; RV64I-NEXT:    sb s10, 107(sp)
+; RV64I-NEXT:    sb s9, 106(sp)
+; RV64I-NEXT:    sb s8, 105(sp)
+; RV64I-NEXT:    sb s7, 104(sp)
+; RV64I-NEXT:    sb s5, 103(sp)
+; RV64I-NEXT:    sb s3, 102(sp)
+; RV64I-NEXT:    sb s1, 101(sp)
+; RV64I-NEXT:    sb s0, 100(sp)
+; RV64I-NEXT:    sb t6, 99(sp)
+; RV64I-NEXT:    sb t5, 98(sp)
+; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
 ; RV64I-NEXT:    sb t2, 95(sp)
 ; RV64I-NEXT:    sb t1, 94(sp)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
@@ -2248,173 +2261,173 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, s11, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 88
 ; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a5, 27(a0)
+; RV64I-NEXT:    lbu a6, 28(a0)
+; RV64I-NEXT:    lbu a1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s3, 9(a0)
+; RV64I-NEXT:    lbu s4, 10(a0)
+; RV64I-NEXT:    lbu s5, 11(a0)
+; RV64I-NEXT:    lbu s6, 12(a0)
+; RV64I-NEXT:    lbu s7, 13(a0)
+; RV64I-NEXT:    lbu s8, 14(a0)
+; RV64I-NEXT:    lbu s9, 15(a0)
+; RV64I-NEXT:    lbu t1, 16(a0)
+; RV64I-NEXT:    lbu t2, 17(a0)
+; RV64I-NEXT:    lbu t3, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu t4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s5, s5, 24
+; RV64I-NEXT:    or s3, s5, s4
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s3, s7, s6
+; RV64I-NEXT:    slli s8, s8, 16
+; RV64I-NEXT:    slli s9, s9, 24
+; RV64I-NEXT:    or s4, s9, s8
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s3, s3, 32
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 0(a0)
+; RV64I-NEXT:    lbu s4, 1(a0)
+; RV64I-NEXT:    lbu s5, 2(a0)
+; RV64I-NEXT:    lbu s6, 3(a0)
+; RV64I-NEXT:    lbu s7, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or s4, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or s5, s10, s9
+; RV64I-NEXT:    or s4, s5, s4
+; RV64I-NEXT:    lbu a0, 24(a0)
+; RV64I-NEXT:    slli s4, s4, 32
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a5, a5, 24
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a1, a0
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    or a1, t5, t3
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:    andi a4, s11, 7
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    or a5, t6, t4
+; RV64I-NEXT:    srli a0, s3, 1
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    xori a6, a4, 63
+; RV64I-NEXT:    srl a0, a0, a6
+; RV64I-NEXT:    or a5, s0, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
-; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
-; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    srli a1, a5, 1
+; RV64I-NEXT:    srl a6, a1, a6
+; RV64I-NEXT:    srli a1, s2, 1
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    srl a7, a1, a7
+; RV64I-NEXT:    sll a1, s2, a4
+; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    sll a5, a5, a4
+; RV64I-NEXT:    sll a4, s3, a4
+; RV64I-NEXT:    srli t0, a5, 56
+; RV64I-NEXT:    sb t0, 23(a2)
+; RV64I-NEXT:    srli t0, a5, 48
+; RV64I-NEXT:    sb t0, 22(a2)
+; RV64I-NEXT:    srli t0, a5, 40
+; RV64I-NEXT:    sb t0, 21(a2)
+; RV64I-NEXT:    srli t0, a5, 32
+; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a5, 24
+; RV64I-NEXT:    sb t0, 19(a2)
+; RV64I-NEXT:    srli t0, a5, 16
+; RV64I-NEXT:    sb t0, 18(a2)
+; RV64I-NEXT:    or a7, a5, a7
 ; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 1(a2)
-; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
-; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
-; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
-; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    or a5, a3, a6
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a7, 16(a2)
+; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -2448,18 +2461,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -2474,44 +2501,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
+; RV32I-NEXT:    sb a4, 91(sp)
+; RV32I-NEXT:    sb t0, 90(sp)
+; RV32I-NEXT:    sb ra, 89(sp)
 ; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
+; RV32I-NEXT:    sb a1, 87(sp)
+; RV32I-NEXT:    sb a3, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2544,6 +2548,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb a5, 85(sp)
+; RV32I-NEXT:    sb a7, 84(sp)
+; RV32I-NEXT:    sb s11, 83(sp)
+; RV32I-NEXT:    sb s10, 82(sp)
+; RV32I-NEXT:    sb s9, 81(sp)
+; RV32I-NEXT:    sb s8, 80(sp)
+; RV32I-NEXT:    sb s7, 79(sp)
+; RV32I-NEXT:    sb s6, 78(sp)
+; RV32I-NEXT:    sb s5, 77(sp)
+; RV32I-NEXT:    sb s4, 76(sp)
 ; RV32I-NEXT:    sb s3, 75(sp)
 ; RV32I-NEXT:    sb s2, 74(sp)
 ; RV32I-NEXT:    sb s1, 73(sp)
@@ -2554,189 +2568,192 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 68(sp)
 ; RV32I-NEXT:    sb t2, 67(sp)
 ; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    addi a1, sp, 60
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    lbu a1, 8(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
+; RV32I-NEXT:    lbu a7, 16(a0)
+; RV32I-NEXT:    lbu t6, 17(a0)
+; RV32I-NEXT:    lbu t5, 18(a0)
+; RV32I-NEXT:    lbu s1, 29(a0)
+; RV32I-NEXT:    lbu s0, 30(a0)
+; RV32I-NEXT:    lbu s2, 31(a0)
+; RV32I-NEXT:    lbu s7, 21(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu s9, 23(a0)
+; RV32I-NEXT:    lbu s3, 24(a0)
+; RV32I-NEXT:    lbu s5, 25(a0)
+; RV32I-NEXT:    lbu s4, 26(a0)
+; RV32I-NEXT:    lbu s6, 27(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu ra, 3(a0)
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a3, a5, a6
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    or a1, ra, a1
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a1, t4, a5
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or a5, t3, t2
+; RV32I-NEXT:    or a6, a5, a1
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    lbu a5, 20(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or t0, s11, t1
+; RV32I-NEXT:    or t0, t0, a1
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or a1, s7, a5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or a5, s9, s8
+; RV32I-NEXT:    or t1, a5, a1
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or a1, t6, a7
+; RV32I-NEXT:    lbu a5, 28(a0)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi a7, a0, 7
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or a0, s10, t5
+; RV32I-NEXT:    srli t2, a3, 1
+; RV32I-NEXT:    or t3, a0, a1
+; RV32I-NEXT:    xori t4, a7, 31
+; RV32I-NEXT:    srl a0, t2, t4
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or a5, s1, a5
+; RV32I-NEXT:    srli a1, t0, 1
+; RV32I-NEXT:    srl a1, a1, t4
 ; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
-; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
-; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
+; RV32I-NEXT:    or t2, s2, s0
+; RV32I-NEXT:    srli t5, a4, 1
+; RV32I-NEXT:    or t2, t2, a5
+; RV32I-NEXT:    not t6, a7
+; RV32I-NEXT:    srl a5, t5, t6
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t5, s5, s3
+; RV32I-NEXT:    srli s0, t3, 1
+; RV32I-NEXT:    srl s0, s0, t4
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or s1, s6, s4
+; RV32I-NEXT:    srli s2, a6, 1
+; RV32I-NEXT:    srl s2, s2, t6
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    srli s1, t5, 1
+; RV32I-NEXT:    srl t4, s1, t4
+; RV32I-NEXT:    srli s1, t1, 1
+; RV32I-NEXT:    srl t6, s1, t6
+; RV32I-NEXT:    sll a4, a4, a7
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    sll t0, t0, a7
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t5, t5, a7
+; RV32I-NEXT:    sll a3, a3, a7
 ; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
+; RV32I-NEXT:    sb a7, 27(a2)
 ; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
+; RV32I-NEXT:    sb a7, 26(a2)
+; RV32I-NEXT:    or a7, t5, t6
+; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    sb t5, 25(a2)
+; RV32I-NEXT:    srli t5, t2, 24
+; RV32I-NEXT:    sb t5, 31(a2)
+; RV32I-NEXT:    srli t5, t2, 16
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    or t4, t2, t4
+; RV32I-NEXT:    srli t2, t2, 8
+; RV32I-NEXT:    sb t2, 29(a2)
 ; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
+; RV32I-NEXT:    sb t2, 19(a2)
 ; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    sb t2, 18(a2)
 ; RV32I-NEXT:    or t2, t3, s2
 ; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
+; RV32I-NEXT:    sb t3, 17(a2)
 ; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
+; RV32I-NEXT:    sb t3, 23(a2)
 ; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
+; RV32I-NEXT:    sb t3, 22(a2)
+; RV32I-NEXT:    or t3, t1, s0
 ; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 9(a2)
+; RV32I-NEXT:    sb t1, 21(a2)
 ; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    sb t1, 11(a2)
 ; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 14(a2)
+; RV32I-NEXT:    sb t1, 10(a2)
 ; RV32I-NEXT:    or a5, t0, a5
 ; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    srli t0, a6, 24
+; RV32I-NEXT:    sb t0, 15(a2)
+; RV32I-NEXT:    srli t0, a6, 16
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 13(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t4, 28(a2)
+; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb t3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
@@ -2776,106 +2793,107 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a3, 28(a0)
 ; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lbu a3, 29(a0)
 ; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    lbu t6, 31(a0)
+; RV64I-NEXT:    lbu a3, 30(a0)
 ; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t3, 6(a0)
-; RV64I-NEXT:    lbu t4, 7(a0)
-; RV64I-NEXT:    lbu t5, 8(a0)
-; RV64I-NEXT:    lbu t6, 9(a0)
-; RV64I-NEXT:    lbu s0, 10(a0)
-; RV64I-NEXT:    lbu s1, 11(a0)
-; RV64I-NEXT:    lbu s2, 12(a0)
-; RV64I-NEXT:    lbu s3, 13(a0)
-; RV64I-NEXT:    lbu s4, 14(a0)
-; RV64I-NEXT:    lbu s5, 15(a0)
-; RV64I-NEXT:    lbu s6, 16(a0)
-; RV64I-NEXT:    lbu s7, 17(a0)
-; RV64I-NEXT:    lbu s8, 18(a0)
-; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    lbu t3, 4(a1)
+; RV64I-NEXT:    lbu t4, 5(a1)
+; RV64I-NEXT:    lbu t5, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    or a1, a1, t5
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t2, 3(a0)
+; RV64I-NEXT:    lbu t3, 4(a0)
+; RV64I-NEXT:    lbu t4, 5(a0)
+; RV64I-NEXT:    lbu t5, 6(a0)
+; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu s0, 8(a0)
+; RV64I-NEXT:    lbu s1, 9(a0)
+; RV64I-NEXT:    lbu s2, 10(a0)
+; RV64I-NEXT:    lbu s3, 11(a0)
+; RV64I-NEXT:    lbu s4, 12(a0)
+; RV64I-NEXT:    lbu s5, 13(a0)
+; RV64I-NEXT:    lbu s6, 14(a0)
+; RV64I-NEXT:    lbu s7, 15(a0)
+; RV64I-NEXT:    lbu s8, 16(a0)
+; RV64I-NEXT:    lbu s9, 17(a0)
+; RV64I-NEXT:    lbu s10, 18(a0)
+; RV64I-NEXT:    lbu s11, 19(a0)
+; RV64I-NEXT:    lbu ra, 20(a0)
+; RV64I-NEXT:    lbu a7, 21(a0)
+; RV64I-NEXT:    lbu a6, 22(a0)
+; RV64I-NEXT:    lbu a5, 23(a0)
+; RV64I-NEXT:    lbu a4, 24(a0)
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a1, 26(a0)
 ; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
+; RV64I-NEXT:    ld t1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 86(sp)
+; RV64I-NEXT:    ld t1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 85(sp)
+; RV64I-NEXT:    ld t1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 84(sp)
 ; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s9, 75(sp)
-; RV64I-NEXT:    sb s8, 74(sp)
-; RV64I-NEXT:    sb s7, 73(sp)
-; RV64I-NEXT:    sb s6, 72(sp)
-; RV64I-NEXT:    sb s5, 71(sp)
-; RV64I-NEXT:    sb s4, 70(sp)
-; RV64I-NEXT:    sb s3, 69(sp)
-; RV64I-NEXT:    sb s2, 68(sp)
-; RV64I-NEXT:    sb s1, 67(sp)
-; RV64I-NEXT:    sb s0, 66(sp)
-; RV64I-NEXT:    sb t6, 65(sp)
-; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb t4, 63(sp)
-; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 82(sp)
+; RV64I-NEXT:    sb a3, 81(sp)
+; RV64I-NEXT:    sb t6, 87(sp)
+; RV64I-NEXT:    slli t6, t6, 56
+; RV64I-NEXT:    sb a4, 80(sp)
+; RV64I-NEXT:    sb a5, 79(sp)
+; RV64I-NEXT:    sb a6, 78(sp)
+; RV64I-NEXT:    sb a7, 77(sp)
+; RV64I-NEXT:    sb ra, 76(sp)
+; RV64I-NEXT:    sb s11, 75(sp)
+; RV64I-NEXT:    sb s10, 74(sp)
+; RV64I-NEXT:    sb s9, 73(sp)
+; RV64I-NEXT:    sb s8, 72(sp)
+; RV64I-NEXT:    sb s7, 71(sp)
+; RV64I-NEXT:    sb s6, 70(sp)
+; RV64I-NEXT:    sb s5, 69(sp)
+; RV64I-NEXT:    sb s4, 68(sp)
+; RV64I-NEXT:    sb s3, 67(sp)
+; RV64I-NEXT:    sb s2, 66(sp)
+; RV64I-NEXT:    sb s1, 65(sp)
+; RV64I-NEXT:    sb s0, 64(sp)
+; RV64I-NEXT:    sb t0, 63(sp)
+; RV64I-NEXT:    sb t5, 62(sp)
+; RV64I-NEXT:    sb t4, 61(sp)
+; RV64I-NEXT:    sb t3, 60(sp)
+; RV64I-NEXT:    sb t2, 59(sp)
+; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    srai a0, t6, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2915,108 +2933,111 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
+; RV64I-NEXT:    ld a7, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    slli a0, a7, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu s1, 21(a0)
+; RV64I-NEXT:    lbu s2, 22(a0)
+; RV64I-NEXT:    lbu s3, 23(a0)
+; RV64I-NEXT:    lbu s4, 24(a0)
+; RV64I-NEXT:    lbu s5, 25(a0)
+; RV64I-NEXT:    lbu s6, 26(a0)
+; RV64I-NEXT:    lbu s7, 27(a0)
+; RV64I-NEXT:    lbu s8, 28(a0)
+; RV64I-NEXT:    lbu s9, 29(a0)
+; RV64I-NEXT:    lbu s10, 30(a0)
+; RV64I-NEXT:    lbu s11, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    lbu ra, 7(a0)
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    ld a3, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    or a3, t0, a3
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a4, t2, t1
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a4, a3, a1
+; RV64I-NEXT:    andi a3, a7, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a1, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t6, t5
+; RV64I-NEXT:    or a1, a5, a1
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a5, s3, s2
+; RV64I-NEXT:    or a5, a5, s0
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    slli a1, a5, 1
 ; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli ra, ra, 24
+; RV64I-NEXT:    or a0, ra, a0
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    or a0, s5, s4
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or a7, s7, s6
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or a7, s9, s8
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or t0, s11, s10
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
 ; RV64I-NEXT:    srl a4, a6, a3
 ; RV64I-NEXT:    srl a5, a5, a3
 ; RV64I-NEXT:    sra a3, a7, a3
@@ -3063,26 +3084,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
 ; RV64I-NEXT:    srli a3, a3, 56
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -3115,94 +3136,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a7, 31(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu t1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t6, 8(a0)
+; RV32I-NEXT:    lbu s0, 9(a0)
+; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s2, 11(a0)
+; RV32I-NEXT:    lbu s3, 12(a0)
+; RV32I-NEXT:    lbu s4, 13(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s7, 16(a0)
+; RV32I-NEXT:    lbu s8, 17(a0)
+; RV32I-NEXT:    lbu s9, 18(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    lbu ra, 21(a0)
+; RV32I-NEXT:    lbu a6, 22(a0)
+; RV32I-NEXT:    lbu a5, 23(a0)
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu a1, 26(a0)
 ; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb t2, 57(sp)
+; RV32I-NEXT:    sb t3, 56(sp)
 ; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
+; RV32I-NEXT:    sb a1, 54(sp)
+; RV32I-NEXT:    sb a3, 53(sp)
+; RV32I-NEXT:    sb a7, 59(sp)
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    sb a4, 52(sp)
+; RV32I-NEXT:    sb a5, 51(sp)
+; RV32I-NEXT:    sb a6, 50(sp)
+; RV32I-NEXT:    sb ra, 49(sp)
+; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s10, 47(sp)
+; RV32I-NEXT:    sb s9, 46(sp)
+; RV32I-NEXT:    sb s8, 45(sp)
+; RV32I-NEXT:    sb s7, 44(sp)
+; RV32I-NEXT:    sb s6, 43(sp)
+; RV32I-NEXT:    sb s5, 42(sp)
+; RV32I-NEXT:    sb s4, 41(sp)
+; RV32I-NEXT:    sb s3, 40(sp)
+; RV32I-NEXT:    sb s2, 39(sp)
+; RV32I-NEXT:    sb s1, 38(sp)
+; RV32I-NEXT:    sb s0, 37(sp)
+; RV32I-NEXT:    sb t6, 36(sp)
+; RV32I-NEXT:    sb t5, 35(sp)
+; RV32I-NEXT:    sb t4, 34(sp)
+; RV32I-NEXT:    sb t1, 33(sp)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
+; RV32I-NEXT:    srai a0, a7, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -3238,176 +3258,184 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu s4, 20(a0)
+; RV32I-NEXT:    lbu s6, 21(a0)
+; RV32I-NEXT:    lbu s7, 22(a0)
+; RV32I-NEXT:    lbu s8, 23(a0)
+; RV32I-NEXT:    lbu s5, 24(a0)
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s10, 26(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
+; RV32I-NEXT:    lbu ra, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, t0, 16
+; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    lw t0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    lbu t1, 1(a0)
+; RV32I-NEXT:    lbu t2, 0(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    or t0, t0, a1
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a1, t1, t2
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or a3, t6, t5
+; RV32I-NEXT:    or t2, a3, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or a0, s6, s4
+; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t3, a3, 7
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    or a3, s8, s7
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    or t4, a3, a0
+; RV32I-NEXT:    not a3, t3
+; RV32I-NEXT:    sll a0, t1, a3
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t5, s9, s5
+; RV32I-NEXT:    slli t6, a4, 1
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s1, s11, s10
+; RV32I-NEXT:    slli t1, s0, 1
+; RV32I-NEXT:    sll t1, t1, a3
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    slli s1, t5, 1
+; RV32I-NEXT:    sll s1, s1, a3
+; RV32I-NEXT:    xori s2, t3, 31
+; RV32I-NEXT:    sll a3, t6, s2
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
+; RV32I-NEXT:    or a5, a5, ra
+; RV32I-NEXT:    slli t6, t2, 1
+; RV32I-NEXT:    sll t6, t6, s2
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, t4, 1
+; RV32I-NEXT:    sll a7, a7, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    sll a6, a6, s2
+; RV32I-NEXT:    srl a4, a4, t3
+; RV32I-NEXT:    srl a1, a1, t3
+; RV32I-NEXT:    srl t2, t2, t3
+; RV32I-NEXT:    srl t0, t0, t3
+; RV32I-NEXT:    srl t4, t4, t3
+; RV32I-NEXT:    srl s0, s0, t3
+; RV32I-NEXT:    srl t5, t5, t3
+; RV32I-NEXT:    sra a5, a5, t3
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    sb t3, 26(a2)
+; RV32I-NEXT:    or a6, t5, a6
 ; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
+; RV32I-NEXT:    srli t3, t5, 8
+; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a5, 24
+; RV32I-NEXT:    sb t3, 31(a2)
+; RV32I-NEXT:    srli t3, a5, 16
+; RV32I-NEXT:    sb t3, 30(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a5, s0, 16
+; RV32I-NEXT:    sb a5, 18(a2)
+; RV32I-NEXT:    or a5, s0, a7
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 22(a2)
+; RV32I-NEXT:    or a7, t4, s1
+; RV32I-NEXT:    sb t4, 20(a2)
+; RV32I-NEXT:    srli t3, t4, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, t0, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or t3, t0, t6
+; RV32I-NEXT:    sb t0, 8(a2)
+; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    srli t0, t2, 16
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    sb t2, 12(a2)
+; RV32I-NEXT:    srli t1, t2, 8
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    srli a1, a7, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, t0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    sb a3, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 34900b3006915..95cb61ef05051 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(i32* %a) {
 define i64 @ldd(i64* %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ldd:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    lw a1, 32(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a2, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a3, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a1, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a2, 32(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a3, 36(a0)
 ; RV32XTHEADMEMPAIR-NEXT:    lw a0, 40(a0)
-; RV32XTHEADMEMPAIR-NEXT:    add a2, a2, a3
-; RV32XTHEADMEMPAIR-NEXT:    add a0, a1, a0
-; RV32XTHEADMEMPAIR-NEXT:    sltu a1, a0, a1
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a2, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a3, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a0
+; RV32XTHEADMEMPAIR-NEXT:    sltu a2, a0, a2
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a1, a2
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ldd:

>From f61a84305764c8c4dab15065f46a1678f31b0d05 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 6 Dec 2023 14:03:24 +0000
Subject: [PATCH 6/6] Add parameter to allow reordering due to memop clustering

Reordering based on the sort order of the MemOpInfo array was disabled
in <https://reviews.llvm.org/D72706>, but it's not clear this is
desirable for all targets. Disabling it also makes it more difficult to
compare the incremental benefit of enabling load clustering in the
SelectionDAG scheduler as well as in the MachineScheduler, as the
SelectionDAG scheduler does appear to allow this reordering.
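
For any other backend that wants to experiment with the same behaviour,
the opt-in is just the extra boolean argument to the cluster mutation
factory functions (the default of false keeps existing behaviour
unchanged). A minimal sketch, using a hypothetical MyTargetPassConfig
and assuming the usual TargetPassConfig boilerplate and
MachineScheduler.h include (not part of this patch):

  // Hypothetical target pass config opting in to reordering while
  // clustering; only the extra boolean argument is new.
  ScheduleDAGInstrs *
  MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    // With ReorderWhileClustering=true, clusterNeighboringMemOps may swap
    // SUa/SUb into MemOpInfo sort order rather than keeping NodeNum order.
    DAG->addMutation(createLoadClusterDAGMutation(
        DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
    return DAG;
  }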
---
 llvm/include/llvm/CodeGen/MachineScheduler.h  |    4 +-
 llvm/lib/CodeGen/MachineScheduler.cpp         |   23 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |    2 +-
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   42 +-
 llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll |    8 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll         |   80 +-
 llvm/test/CodeGen/RISCV/atomic-signext.ll     |   16 +-
 .../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll |    8 +-
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |   96 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |   48 +-
 llvm/test/CodeGen/RISCV/forced-atomics.ll     |   22 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |  104 +-
 .../test/CodeGen/RISCV/intrinsic-cttz-elts.ll |   16 +-
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |   14 +-
 llvm/test/CodeGen/RISCV/llvm.exp10.ll         |    8 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |   76 +-
 llvm/test/CodeGen/RISCV/memcpy.ll             |   62 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |   10 +-
 llvm/test/CodeGen/RISCV/mul.ll                |  114 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |    2 +-
 .../test/CodeGen/RISCV/reduction-formation.ll |   72 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |  136 +-
 llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll    |   64 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   60 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll   |   96 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   38 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  | 1230 ++++++-------
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 1048 +++++------
 .../fixed-vectors-strided-load-store-asm.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  324 ++--
 llvm/test/CodeGen/RISCV/rvv/pr63596.ll        |   12 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  134 +-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |   26 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  552 +++---
 .../RISCV/umulo-128-legalisation-lowering.ll  |   56 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  400 ++--
 llvm/test/CodeGen/RISCV/vararg.ll             |   30 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 1570 ++++++++--------
 .../RISCV/wide-scalar-shift-legalization.ll   | 1604 ++++++++---------
 39 files changed, 4110 insertions(+), 4113 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 9f16cf5d5bc38..419d665dfbfc2 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1349,11 +1349,11 @@ ScheduleDAGMI *createGenericSchedPostRA(MachineSchedContext *C);
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+                             const TargetRegisterInfo *TRI, bool ReorderWhileClustering = false);
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI);
+                              const TargetRegisterInfo *TRI, bool ReorderWhileClustering = false);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index cd5fe71ef0c1a..a86da16edd8d3 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1743,11 +1743,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   bool IsLoad;
+  bool ReorderWhileClustering;
 
 public:
   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
-                           const TargetRegisterInfo *tri, bool IsLoad)
-      : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+                           const TargetRegisterInfo *tri, bool IsLoad, bool ReorderWhileClustering)
+      : TII(tii), TRI(tri), IsLoad(IsLoad), ReorderWhileClustering(ReorderWhileClustering) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
@@ -1763,14 +1764,14 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
 class StoreClusterMutation : public BaseMemOpClusterMutation {
 public:
   StoreClusterMutation(const TargetInstrInfo *tii,
-                       const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, false) {}
+                       const TargetRegisterInfo *tri, bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {}
 };
 
 class LoadClusterMutation : public BaseMemOpClusterMutation {
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, true) {}
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri, bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {}
 };
 
 } // end anonymous namespace
@@ -1779,15 +1780,15 @@ namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
+                             const TargetRegisterInfo *TRI, bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
+                              const TargetRegisterInfo *TRI, bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
@@ -1840,7 +1841,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
 
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
-    if (SUa->NodeNum > SUb->NodeNum)
+    if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 0954fbb8314c6..5e3f436928fa8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -346,7 +346,7 @@ class RISCVPassConfig : public TargetPassConfig {
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
     if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
     return DAG;
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 3695a8a7f6086..823918f1c42e7 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -167,17 +167,17 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 12(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srli a5, a2, 29
 ; RV32I-NEXT:    slli a6, a3, 3
 ; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    srli a3, a3, 29
-; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    slli a6, a4, 3
 ; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    srli a1, a1, 29
-; RV32I-NEXT:    slli a4, a4, 3
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a4, 29
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a2, a2, 3
 ; RV32I-NEXT:    lui a4, 128
 ; RV32I-NEXT:    add a1, a1, a4
@@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    lw a6, 8(a1)
-; RV32C-NEXT:    c.lw a3, 12(a1)
-; RV32C-NEXT:    c.lw a2, 4(a1)
-; RV32C-NEXT:    c.lw a1, 0(a1)
+; RV32C-NEXT:    c.lw a2, 12(a1)
+; RV32C-NEXT:    lw a6, 0(a1)
+; RV32C-NEXT:    c.lw a3, 4(a1)
+; RV32C-NEXT:    c.lw a1, 8(a1)
 ; RV32C-NEXT:    c.lui a5, 16
-; RV32C-NEXT:    c.add a3, a5
-; RV32C-NEXT:    c.slli a3, 3
-; RV32C-NEXT:    srli a5, a6, 29
-; RV32C-NEXT:    c.or a3, a5
+; RV32C-NEXT:    c.add a2, a5
+; RV32C-NEXT:    c.slli a2, 3
 ; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    slli a4, a2, 3
+; RV32C-NEXT:    c.or a2, a5
+; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    slli a4, a3, 3
 ; RV32C-NEXT:    c.or a4, a5
-; RV32C-NEXT:    c.srli a2, 29
-; RV32C-NEXT:    c.slli a6, 3
-; RV32C-NEXT:    or a2, a6, a2
+; RV32C-NEXT:    c.srli a3, 29
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.sw a1, 0(a0)
-; RV32C-NEXT:    c.sw a2, 8(a0)
+; RV32C-NEXT:    c.or a1, a3
+; RV32C-NEXT:    c.slli a6, 3
+; RV32C-NEXT:    sw a6, 0(a0)
+; RV32C-NEXT:    c.sw a1, 8(a0)
 ; RV32C-NEXT:    c.sw a4, 4(a0)
-; RV32C-NEXT:    c.sw a3, 12(a0)
+; RV32C-NEXT:    c.sw a2, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
 ; RV64C-LABEL: add_wide_operand:
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
index 895852b84e004..9363b51056248 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
@@ -192,8 +192,8 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB11_2
@@ -268,8 +268,8 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB13_2
@@ -344,8 +344,8 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB15_2
@@ -420,8 +420,8 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    lw a5, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB17_2
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index e97a1ea5dfca0..51ab65083fbb3 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -19066,8 +19066,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB220_2
@@ -19120,8 +19120,8 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB220_2
@@ -19219,8 +19219,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB221_2
@@ -19273,8 +19273,8 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB221_2
@@ -19377,8 +19377,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB222_2
@@ -19431,8 +19431,8 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB222_2
@@ -19535,8 +19535,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB223_2
@@ -19589,8 +19589,8 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB223_2
@@ -19693,8 +19693,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB224_2
@@ -19747,8 +19747,8 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB224_2
@@ -19851,8 +19851,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB225_2
@@ -19905,8 +19905,8 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB225_2
@@ -20004,8 +20004,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB226_2
@@ -20058,8 +20058,8 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB226_2
@@ -20162,8 +20162,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB227_2
@@ -20216,8 +20216,8 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB227_2
@@ -20320,8 +20320,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB228_2
@@ -20374,8 +20374,8 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB228_2
@@ -20478,8 +20478,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB229_2
@@ -20532,8 +20532,8 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB229_2
@@ -20636,8 +20636,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB230_2
@@ -20690,8 +20690,8 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB230_2
@@ -20789,8 +20789,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB231_2
@@ -20843,8 +20843,8 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB231_2
@@ -20947,8 +20947,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB232_2
@@ -21001,8 +21001,8 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB232_2
@@ -21105,8 +21105,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB233_2
@@ -21159,8 +21159,8 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB233_2
@@ -21263,8 +21263,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB234_2
@@ -21317,8 +21317,8 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB234_2
@@ -21421,8 +21421,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB235_2
@@ -21475,8 +21475,8 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB235_2
@@ -21574,8 +21574,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB236_2
@@ -21628,8 +21628,8 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB236_2
@@ -21732,8 +21732,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB237_2
@@ -21786,8 +21786,8 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB237_2
@@ -21890,8 +21890,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB238_2
@@ -21944,8 +21944,8 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB238_2
@@ -22048,8 +22048,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB239_2
@@ -22102,8 +22102,8 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB239_2
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index 2739fde250ee2..fb3ef57e272c3 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -3137,8 +3137,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB43_2
@@ -3191,8 +3191,8 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB43_2
@@ -3290,8 +3290,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB44_2
@@ -3344,8 +3344,8 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB44_2
@@ -3443,8 +3443,8 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB45_2
@@ -3497,8 +3497,8 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB45_2
@@ -3596,8 +3596,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB46_2
@@ -3650,8 +3650,8 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB46_2
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 5f15a9c067102..c55b3cedbda50 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -468,8 +468,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB3_3
@@ -523,8 +523,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB3_3
@@ -1211,8 +1211,8 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB7_2
@@ -1274,8 +1274,8 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB7_2
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 3a93ac8966025..798e665c2128d 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 12(a1)
-; RV32I-FPELIM-NEXT:    lw a3, 8(a1)
-; RV32I-FPELIM-NEXT:    lw a4, 4(a1)
+; RV32I-FPELIM-NEXT:    lw a2, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a4, 12(a1)
 ; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a6, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a1, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32I-FPELIM-NEXT:    xor a2, a5, a2
-; RV32I-FPELIM-NEXT:    xor a4, a7, a4
-; RV32I-FPELIM-NEXT:    or a2, a4, a2
-; RV32I-FPELIM-NEXT:    xor a3, a6, a3
+; RV32I-FPELIM-NEXT:    lw a6, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a7, 4(a1)
+; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
+; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
+; RV32I-FPELIM-NEXT:    xor a4, a5, a4
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
+; RV32I-FPELIM-NEXT:    or a3, a3, a4
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
+; RV32I-FPELIM-NEXT:    xor a1, a2, a6
+; RV32I-FPELIM-NEXT:    or a0, a1, a0
 ; RV32I-FPELIM-NEXT:    or a0, a0, a3
-; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 12(a1)
-; RV32I-WITHFP-NEXT:    lw a3, 8(a1)
-; RV32I-WITHFP-NEXT:    lw a4, 4(a1)
+; RV32I-WITHFP-NEXT:    lw a2, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a4, 12(a1)
 ; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a6, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a1, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
-; RV32I-WITHFP-NEXT:    xor a2, a5, a2
-; RV32I-WITHFP-NEXT:    xor a4, a7, a4
-; RV32I-WITHFP-NEXT:    or a2, a4, a2
-; RV32I-WITHFP-NEXT:    xor a3, a6, a3
+; RV32I-WITHFP-NEXT:    lw a6, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a7, 4(a1)
+; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
+; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
+; RV32I-WITHFP-NEXT:    xor a4, a5, a4
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
+; RV32I-WITHFP-NEXT:    or a3, a3, a4
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
+; RV32I-WITHFP-NEXT:    xor a1, a2, a6
+; RV32I-WITHFP-NEXT:    or a0, a1, a0
 ; RV32I-WITHFP-NEXT:    or a0, a0, a3
-; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -297,21 +297,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lw a1, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a2, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a1, 0(a7)
+; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
+; RV32I-FPELIM-NEXT:    lw a3, 12(a0)
 ; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a5, 8(a7)
-; RV32I-FPELIM-NEXT:    lw a6, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 0(a7)
-; RV32I-FPELIM-NEXT:    xor a1, a4, a1
-; RV32I-FPELIM-NEXT:    xor a3, a6, a3
-; RV32I-FPELIM-NEXT:    or a1, a3, a1
-; RV32I-FPELIM-NEXT:    xor a2, a5, a2
+; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
+; RV32I-FPELIM-NEXT:    xor a3, a4, a3
+; RV32I-FPELIM-NEXT:    xor a2, a2, a6
+; RV32I-FPELIM-NEXT:    or a2, a2, a3
 ; RV32I-FPELIM-NEXT:    xor a0, a7, a0
+; RV32I-FPELIM-NEXT:    xor a1, a1, a5
+; RV32I-FPELIM-NEXT:    or a0, a1, a0
 ; RV32I-FPELIM-NEXT:    or a0, a0, a2
-; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -322,21 +322,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
-; RV32I-WITHFP-NEXT:    lw a1, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a2, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a1, 0(a7)
+; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
+; RV32I-WITHFP-NEXT:    lw a3, 12(a0)
 ; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a5, 8(a7)
-; RV32I-WITHFP-NEXT:    lw a6, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 0(a7)
-; RV32I-WITHFP-NEXT:    xor a1, a4, a1
-; RV32I-WITHFP-NEXT:    xor a3, a6, a3
-; RV32I-WITHFP-NEXT:    or a1, a3, a1
-; RV32I-WITHFP-NEXT:    xor a2, a5, a2
+; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
+; RV32I-WITHFP-NEXT:    xor a3, a4, a3
+; RV32I-WITHFP-NEXT:    xor a2, a2, a6
+; RV32I-WITHFP-NEXT:    or a2, a2, a3
 ; RV32I-WITHFP-NEXT:    xor a0, a7, a0
+; RV32I-WITHFP-NEXT:    xor a1, a1, a5
+; RV32I-WITHFP-NEXT:    or a0, a1, a0
 ; RV32I-WITHFP-NEXT:    or a0, a0, a2
-; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index 69ffbb0b2511d..fbec084882a7d 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 24(a1)
-; RV64I-NEXT:    ld a3, 16(a1)
-; RV64I-NEXT:    ld a4, 8(a1)
+; RV64I-NEXT:    ld a2, 0(a0)
+; RV64I-NEXT:    ld a3, 8(a0)
+; RV64I-NEXT:    ld a4, 24(a1)
 ; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    ld a7, 8(a0)
-; RV64I-NEXT:    ld a1, 0(a1)
-; RV64I-NEXT:    ld a0, 0(a0)
-; RV64I-NEXT:    xor a2, a5, a2
-; RV64I-NEXT:    xor a4, a7, a4
-; RV64I-NEXT:    or a2, a4, a2
-; RV64I-NEXT:    xor a3, a6, a3
+; RV64I-NEXT:    ld a6, 0(a1)
+; RV64I-NEXT:    ld a7, 8(a1)
+; RV64I-NEXT:    ld a1, 16(a1)
+; RV64I-NEXT:    ld a0, 16(a0)
+; RV64I-NEXT:    xor a4, a5, a4
+; RV64I-NEXT:    xor a3, a3, a7
+; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    xor a1, a2, a6
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -161,21 +161,21 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-LABEL: callee_large_scalars_exhausted_regs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 8(sp)
-; RV64I-NEXT:    ld a1, 24(a0)
-; RV64I-NEXT:    ld a2, 16(a0)
-; RV64I-NEXT:    ld a3, 8(a0)
+; RV64I-NEXT:    ld a1, 0(a7)
+; RV64I-NEXT:    ld a2, 8(a7)
+; RV64I-NEXT:    ld a3, 24(a0)
 ; RV64I-NEXT:    ld a4, 24(a7)
-; RV64I-NEXT:    ld a5, 16(a7)
-; RV64I-NEXT:    ld a6, 8(a7)
-; RV64I-NEXT:    ld a0, 0(a0)
-; RV64I-NEXT:    ld a7, 0(a7)
-; RV64I-NEXT:    xor a1, a4, a1
-; RV64I-NEXT:    xor a3, a6, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    xor a2, a5, a2
+; RV64I-NEXT:    ld a5, 0(a0)
+; RV64I-NEXT:    ld a6, 8(a0)
+; RV64I-NEXT:    ld a0, 16(a0)
+; RV64I-NEXT:    ld a7, 16(a7)
+; RV64I-NEXT:    xor a3, a4, a3
+; RV64I-NEXT:    xor a2, a2, a6
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    xor a0, a7, a0
+; RV64I-NEXT:    xor a1, a1, a5
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index f2079e314d51c..282fcf01a4bd1 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3348,8 +3348,8 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB49_2
 ; RV32-NEXT:  .LBB49_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB49_2 Depth=1
@@ -3453,8 +3453,8 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB50_2
 ; RV32-NEXT:  .LBB50_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
@@ -3560,8 +3560,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB51_2
 ; RV32-NEXT:  .LBB51_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB51_2 Depth=1
@@ -3652,8 +3652,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    j .LBB52_2
 ; RV32-NEXT:  .LBB52_1: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB52_2 Depth=1
@@ -3802,8 +3802,8 @@ define double @rmw64_fadd_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:  .LBB54_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
@@ -3937,8 +3937,8 @@ define double @rmw64_fsub_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:  .LBB55_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 786176
@@ -4072,8 +4072,8 @@ define double @rmw64_fmin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:  .LBB56_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
@@ -4207,8 +4207,8 @@ define double @rmw64_fmax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:    lw s2, 0(a0)
+; RV32-NEXT:    lw s1, 4(a0)
 ; RV32-NEXT:  .LBB57_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lui a3, 261888
@@ -4531,10 +4531,10 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a1
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a2, 8(a1)
 ; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    lw a2, 8(s0)
-; RV32-NEXT:    lw a3, 4(s0)
-; RV32-NEXT:    lw a4, 0(s0)
 ; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 46b7da2ddc210..6e276b7a7a597 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -302,27 +302,27 @@ define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bgez a2, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a4
-; RV32I-NEXT:    or a6, a1, a3
+; RV32I-NEXT:    neg a5, a1
+; RV32I-NEXT:    or a6, a4, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a4, a4
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a4, a5, a6
-; RV32I-NEXT:    snez a5, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a2, a1, a7
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    snez a5, a4
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    neg a4, a4
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a1, 0(a0)
-; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -330,27 +330,27 @@ define i128 @abs128(i128 %x) {
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a4, 0(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a1, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a4
-; RV32ZBB-NEXT:    or a6, a1, a3
+; RV32ZBB-NEXT:    neg a5, a1
+; RV32ZBB-NEXT:    or a6, a4, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a4, a4
-; RV32ZBB-NEXT:    add a2, a2, a4
-; RV32ZBB-NEXT:    neg a2, a2
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a4, a5, a6
-; RV32ZBB-NEXT:    snez a5, a1
+; RV32ZBB-NEXT:    snez a1, a1
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a2, a1, a7
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    snez a5, a4
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    neg a4, a4
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a1, 0(a0)
-; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
@@ -384,27 +384,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bgez a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a4
-; RV32I-NEXT:    or a6, a1, a3
+; RV32I-NEXT:    neg a5, a1
+; RV32I-NEXT:    or a6, a4, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a4, a4
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a4, a5, a6
-; RV32I-NEXT:    snez a5, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a2, a1, a7
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    snez a5, a4
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    neg a4, a4
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a1, 0(a0)
-; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -412,27 +412,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a4, 0(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a1, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a4
-; RV32ZBB-NEXT:    or a6, a1, a3
+; RV32ZBB-NEXT:    neg a5, a1
+; RV32ZBB-NEXT:    or a6, a4, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a4, a4
-; RV32ZBB-NEXT:    add a2, a2, a4
-; RV32ZBB-NEXT:    neg a2, a2
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a4, a5, a6
-; RV32ZBB-NEXT:    snez a5, a1
+; RV32ZBB-NEXT:    snez a1, a1
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a2, a1, a7
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    snez a5, a4
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    neg a4, a4
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a1, 0(a0)
-; RV32ZBB-NEXT:    sw a4, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 8ce5031780c8a..5b296274e552f 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -7,14 +7,14 @@
 define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV32-LABEL: ctz_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a3, 8(a0)
-; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a3, 0(a0)
 ; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a4, 8(a0)
 ; RV32-NEXT:    lw a2, 12(a0)
-; RV32-NEXT:    seqz a0, a4
+; RV32-NEXT:    seqz a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    andi a0, a0, 4
-; RV32-NEXT:    seqz a3, a3
+; RV32-NEXT:    seqz a3, a4
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    andi a3, a3, 2
 ; RV32-NEXT:    bltu a3, a0, .LBB0_2
@@ -40,14 +40,14 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ;
 ; RV64-LABEL: ctz_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a3, 16(a0)
-; RV64-NEXT:    lw a4, 0(a0)
+; RV64-NEXT:    lw a3, 0(a0)
 ; RV64-NEXT:    lw a1, 8(a0)
+; RV64-NEXT:    lw a4, 16(a0)
 ; RV64-NEXT:    lw a2, 24(a0)
-; RV64-NEXT:    seqz a0, a4
+; RV64-NEXT:    seqz a0, a3
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    andi a0, a0, 4
-; RV64-NEXT:    seqz a3, a3
+; RV64-NEXT:    seqz a3, a4
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    andi a3, a3, 2
 ; RV64-NEXT:    bltu a3, a0, .LBB0_2
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index 6d871dccbfcd6..dfd62e8d5f9f5 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -56,16 +56,16 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 8(a1)
-; RV32-NEXT:    lw a3, 12(a1)
-; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a4, 8(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lui a5, 524288
-; RV32-NEXT:    xor a3, a3, a5
-; RV32-NEXT:    sw a2, 8(a0)
+; RV32-NEXT:    xor a2, a2, a5
+; RV32-NEXT:    sw a4, 8(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a4, 4(a0)
-; RV32-NEXT:    sw a3, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    sw a2, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test3:
diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index 0e5867800e935..22199eedc231c 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -223,10 +223,10 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset s2, -32
 ; RV64IFD-NEXT:    .cfi_offset fs0, -40
 ; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu a2, 8(a1)
 ; RV64IFD-NEXT:    lhu s2, 16(a1)
-; RV64IFD-NEXT:    lhu a1, 8(a1)
 ; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a1
+; RV64IFD-NEXT:    fmv.w.x fa0, a2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
@@ -351,10 +351,10 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset fs2, -64
 ; RV64IFD-NEXT:    lhu s1, 0(a1)
 ; RV64IFD-NEXT:    lhu s2, 8(a1)
+; RV64IFD-NEXT:    lhu a2, 16(a1)
 ; RV64IFD-NEXT:    lhu s3, 24(a1)
-; RV64IFD-NEXT:    lhu a1, 16(a1)
 ; RV64IFD-NEXT:    mv s0, a0
-; RV64IFD-NEXT:    fmv.w.x fa0, a1
+; RV64IFD-NEXT:    fmv.w.x fa0, a2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index f8a8cfc60f14d..b15c515c9b800 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -483,25 +483,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    addi a1, sp, 20
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    lw a1, 8(sp)
 ; RV32I-NEXT:    lw a2, 12(sp)
@@ -509,7 +509,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT:    lw a4, 20(sp)
 ; RV32I-NEXT:    sw a0, 12(s3)
 ; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw s2, 4(s3)
+; RV32I-NEXT:    sw s0, 4(s3)
 ; RV32I-NEXT:    sw s4, 0(s3)
 ; RV32I-NEXT:    sw a4, 28(s3)
 ; RV32I-NEXT:    sw a3, 24(s3)
@@ -533,25 +533,25 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    lw a1, 0(sp)
 ; RV64I-NEXT:    lw a2, 4(sp)
@@ -559,7 +559,7 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT:    lw a4, 12(sp)
 ; RV64I-NEXT:    sw a0, 12(s3)
 ; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw s2, 4(s3)
+; RV64I-NEXT:    sw s0, 4(s3)
 ; RV64I-NEXT:    sw s4, 0(s3)
 ; RV64I-NEXT:    sw a4, 28(s3)
 ; RV64I-NEXT:    sw a3, 24(s3)
@@ -751,29 +751,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    addi a1, sp, 12
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    addi a1, sp, 16
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    addi a1, sp, 20
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    sw a0, 12(s3)
 ; RV32I-NEXT:    sw s1, 8(s3)
-; RV32I-NEXT:    sw s2, 4(s3)
+; RV32I-NEXT:    sw s0, 4(s3)
 ; RV32I-NEXT:    sw s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -793,29 +793,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    mv a1, sp
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    addi a1, sp, 4
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    sw a0, 12(s3)
 ; RV64I-NEXT:    sw s1, 8(s3)
-; RV64I-NEXT:    sw s2, 4(s3)
+; RV64I-NEXT:    sw s0, 4(s3)
 ; RV64I-NEXT:    sw s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -999,22 +999,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32I-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 12(a1)
-; RV32I-NEXT:    lw s1, 8(a1)
-; RV32I-NEXT:    lw s2, 4(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw s0, 4(a1)
+; RV32I-NEXT:    lw s1, 8(a1)
+; RV32I-NEXT:    lw s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    addi a1, sp, 12
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 16
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 20
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    addi a1, sp, 24
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call frexpf@plt
 ; RV32I-NEXT:    lw a0, 24(sp)
 ; RV32I-NEXT:    lw a1, 20(sp)
@@ -1040,22 +1040,22 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64I-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw s0, 24(a1)
-; RV64I-NEXT:    lw s1, 16(a1)
-; RV64I-NEXT:    lw s2, 8(a1)
 ; RV64I-NEXT:    lw a2, 0(a1)
+; RV64I-NEXT:    lw s0, 8(a1)
+; RV64I-NEXT:    lw s1, 16(a1)
+; RV64I-NEXT:    lw s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    addi a1, sp, 8
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 12
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 16
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    addi a1, sp, 20
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call frexpf@plt
 ; RV64I-NEXT:    lw a0, 20(sp)
 ; RV64I-NEXT:    lw a1, 16(sp)
diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll
index d831a2a002a3e..e9f2d1190260e 100644
--- a/llvm/test/CodeGen/RISCV/memcpy.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy.ll
@@ -25,16 +25,16 @@ define i32 @t0() {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    lui a0, %hi(src)
 ; RV32-NEXT:    lw a1, %lo(src)(a0)
-; RV32-NEXT:    lui a2, %hi(dst)
-; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    addi a0, a0, %lo(src)
-; RV32-NEXT:    lbu a1, 10(a0)
+; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    lh a3, 8(a0)
-; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    addi a2, a2, %lo(dst)
-; RV32-NEXT:    sb a1, 10(a2)
-; RV32-NEXT:    sh a3, 8(a2)
-; RV32-NEXT:    sw a0, 4(a2)
+; RV32-NEXT:    lbu a0, 10(a0)
+; RV32-NEXT:    lui a4, %hi(dst)
+; RV32-NEXT:    sw a1, %lo(dst)(a4)
+; RV32-NEXT:    addi a1, a4, %lo(dst)
+; RV32-NEXT:    sb a0, 10(a1)
+; RV32-NEXT:    sh a3, 8(a1)
+; RV32-NEXT:    sw a2, 4(a1)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
 ;
@@ -42,14 +42,14 @@ define i32 @t0() {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lui a0, %hi(src)
 ; RV64-NEXT:    ld a1, %lo(src)(a0)
-; RV64-NEXT:    lui a2, %hi(dst)
 ; RV64-NEXT:    addi a0, a0, %lo(src)
-; RV64-NEXT:    lbu a3, 10(a0)
-; RV64-NEXT:    lh a0, 8(a0)
-; RV64-NEXT:    sd a1, %lo(dst)(a2)
-; RV64-NEXT:    addi a1, a2, %lo(dst)
-; RV64-NEXT:    sb a3, 10(a1)
-; RV64-NEXT:    sh a0, 8(a1)
+; RV64-NEXT:    lh a2, 8(a0)
+; RV64-NEXT:    lbu a0, 10(a0)
+; RV64-NEXT:    lui a3, %hi(dst)
+; RV64-NEXT:    sd a1, %lo(dst)(a3)
+; RV64-NEXT:    addi a1, a3, %lo(dst)
+; RV64-NEXT:    sb a0, 10(a1)
+; RV64-NEXT:    sh a2, 8(a1)
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
 ;
@@ -57,14 +57,14 @@ define i32 @t0() {
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a0, %hi(src)
 ; RV32-FAST-NEXT:    lw a1, %lo(src)(a0)
-; RV32-FAST-NEXT:    lui a2, %hi(dst)
 ; RV32-FAST-NEXT:    addi a0, a0, %lo(src)
-; RV32-FAST-NEXT:    lw a3, 7(a0)
-; RV32-FAST-NEXT:    lw a0, 4(a0)
-; RV32-FAST-NEXT:    sw a1, %lo(dst)(a2)
-; RV32-FAST-NEXT:    addi a1, a2, %lo(dst)
-; RV32-FAST-NEXT:    sw a3, 7(a1)
-; RV32-FAST-NEXT:    sw a0, 4(a1)
+; RV32-FAST-NEXT:    lw a2, 4(a0)
+; RV32-FAST-NEXT:    lw a0, 7(a0)
+; RV32-FAST-NEXT:    lui a3, %hi(dst)
+; RV32-FAST-NEXT:    sw a1, %lo(dst)(a3)
+; RV32-FAST-NEXT:    addi a1, a3, %lo(dst)
+; RV32-FAST-NEXT:    sw a0, 7(a1)
+; RV32-FAST-NEXT:    sw a2, 4(a1)
 ; RV32-FAST-NEXT:    li a0, 0
 ; RV32-FAST-NEXT:    ret
 ;
@@ -166,16 +166,16 @@ define void @t2(ptr nocapture %C) nounwind {
 ; RV64-FAST-NEXT:    lui a1, %hi(.L.str2)
 ; RV64-FAST-NEXT:    ld a2, %lo(.L.str2)(a1)
 ; RV64-FAST-NEXT:    sd a2, 0(a0)
-; RV64-FAST-NEXT:    lui a2, 1156
-; RV64-FAST-NEXT:    addi a2, a2, 332
 ; RV64-FAST-NEXT:    addi a1, a1, %lo(.L.str2)
-; RV64-FAST-NEXT:    ld a3, 24(a1)
-; RV64-FAST-NEXT:    ld a4, 16(a1)
-; RV64-FAST-NEXT:    ld a1, 8(a1)
-; RV64-FAST-NEXT:    sw a2, 32(a0)
-; RV64-FAST-NEXT:    sd a3, 24(a0)
-; RV64-FAST-NEXT:    sd a4, 16(a0)
-; RV64-FAST-NEXT:    sd a1, 8(a0)
+; RV64-FAST-NEXT:    ld a2, 8(a1)
+; RV64-FAST-NEXT:    ld a3, 16(a1)
+; RV64-FAST-NEXT:    ld a1, 24(a1)
+; RV64-FAST-NEXT:    lui a4, 1156
+; RV64-FAST-NEXT:    addi a4, a4, 332
+; RV64-FAST-NEXT:    sw a4, 32(a0)
+; RV64-FAST-NEXT:    sd a1, 24(a0)
+; RV64-FAST-NEXT:    sd a3, 16(a0)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index 023e95747c2cc..069d38460bb19 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=riscv32 -verify-misched -debug-only=machine-scheduler -o - 2>&1 < %s \
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
@@ -5,13 +6,6 @@
 ; RUN:   | FileCheck -check-prefix=LDCLUSTER %s
 
 define i32 @load_clustering_1(ptr nocapture %p) {
-; LDCLUSTER: ********** MI Scheduling **********
-; LDCLUSTER-LABEL: load_clustering_1:%bb.0
-; LDCLUSTER: *** Final schedule for %bb.0 ***
-; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
-; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
-; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
 entry:
   %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
   %val0 = load i32, i32* %arrayidx0
@@ -26,3 +20,5 @@ entry:
   %tmp2 = add i32 %tmp1, %val3
   ret i32 %tmp2
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LDCLUSTER: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 33ab4fbaaf66e..dd6131e064cac 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1168,16 +1168,16 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    lw a2, 0(a1)
-; RV32IM-NEXT:    lw a3, 12(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
 ; RV32IM-NEXT:    lw a4, 8(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -15
 ; RV32IM-NEXT:    slli a5, a5, 8
 ; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    mul a7, a1, a5
+; RV32IM-NEXT:    mul a7, a3, a5
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a1, a5
+; RV32IM-NEXT:    mulhu t0, a3, a5
 ; RV32IM-NEXT:    add a7, t0, a7
 ; RV32IM-NEXT:    sub a6, a6, a2
 ; RV32IM-NEXT:    neg t0, a2
@@ -1186,25 +1186,25 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a1
+; RV32IM-NEXT:    sub t4, t1, a3
 ; RV32IM-NEXT:    mul t5, a4, a5
 ; RV32IM-NEXT:    sub t5, t5, a2
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a1
+; RV32IM-NEXT:    neg s1, a3
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a1, t2
+; RV32IM-NEXT:    mulhu t1, a3, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    mul a3, a3, a5
+; RV32IM-NEXT:    mul a1, a1, a5
 ; RV32IM-NEXT:    mulhu t1, a4, a5
 ; RV32IM-NEXT:    sub a4, t1, a4
-; RV32IM-NEXT:    add a3, a4, a3
-; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    sub a1, t3, a1
-; RV32IM-NEXT:    add a1, a1, a3
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a3, a2, a3
+; RV32IM-NEXT:    sub a3, t3, a3
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
@@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    slli a3, a2, 6
-; RV32I-NEXT:    sltu a5, a2, a3
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    slli a1, a2, 6
+; RV32I-NEXT:    sltu a4, a2, a1
 ; RV32I-NEXT:    srli a7, a2, 26
-; RV32I-NEXT:    slli t0, a1, 6
+; RV32I-NEXT:    slli t0, a3, 6
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    mv t0, a5
-; RV32I-NEXT:    beq a1, a7, .LBB31_2
+; RV32I-NEXT:    mv t0, a4
+; RV32I-NEXT:    beq a3, a7, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a7
+; RV32I-NEXT:    sltu t0, a3, a7
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    srli t1, a1, 26
+; RV32I-NEXT:    srli t1, a3, 26
 ; RV32I-NEXT:    slli t2, a6, 6
 ; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    sub t2, a6, t1
 ; RV32I-NEXT:    sltu t3, t2, t0
 ; RV32I-NEXT:    sltu t1, a6, t1
 ; RV32I-NEXT:    srli a6, a6, 26
-; RV32I-NEXT:    slli t4, a4, 6
+; RV32I-NEXT:    slli t4, a5, 6
 ; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    sub a4, a4, a6
-; RV32I-NEXT:    sub a4, a4, t1
-; RV32I-NEXT:    sub a4, a4, t3
+; RV32I-NEXT:    sub a5, a5, a6
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a5, a5, t3
 ; RV32I-NEXT:    sub a6, t2, t0
-; RV32I-NEXT:    sub a1, a1, a7
-; RV32I-NEXT:    sub a1, a1, a5
-; RV32I-NEXT:    sub a2, a2, a3
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a3, a3, a4
+; RV32I-NEXT:    sub a2, a2, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m63:
@@ -1292,52 +1292,52 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 4(a1)
-; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 12(a1)
-; RV32IM-NEXT:    lw a1, 8(a1)
+; RV32IM-NEXT:    lw a2, 0(a1)
+; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a4, 8(a1)
+; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -63
-; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    slli a7, a2, 6
-; RV32IM-NEXT:    sub a7, a2, a7
+; RV32IM-NEXT:    mulhu a6, a2, a5
+; RV32IM-NEXT:    slli a7, a3, 6
+; RV32IM-NEXT:    sub a7, a3, a7
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a2, a5
+; RV32IM-NEXT:    mulhu t0, a3, a5
 ; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a3
-; RV32IM-NEXT:    neg t0, a3
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    neg t0, a2
 ; RV32IM-NEXT:    sltu t1, a6, t0
 ; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a3, t2
+; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a2
-; RV32IM-NEXT:    slli t5, a1, 6
-; RV32IM-NEXT:    sub t6, a1, a3
+; RV32IM-NEXT:    sub t4, t1, a3
+; RV32IM-NEXT:    slli t5, a4, 6
+; RV32IM-NEXT:    sub t6, a4, a2
 ; RV32IM-NEXT:    sub t5, t6, t5
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a2
+; RV32IM-NEXT:    neg s1, a3
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a2, t2
+; RV32IM-NEXT:    mulhu t1, a3, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a4, 6
-; RV32IM-NEXT:    sub a4, a4, t1
-; RV32IM-NEXT:    mulhu a5, a1, a5
-; RV32IM-NEXT:    sub a5, a5, a1
-; RV32IM-NEXT:    add a4, a5, a4
-; RV32IM-NEXT:    add a2, a3, a2
-; RV32IM-NEXT:    sub a1, t3, a2
-; RV32IM-NEXT:    add a1, a1, a4
+; RV32IM-NEXT:    slli t1, a1, 6
+; RV32IM-NEXT:    sub a1, a1, t1
+; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    sub a5, a5, a4
+; RV32IM-NEXT:    add a1, a5, a1
+; RV32IM-NEXT:    add a3, a2, a3
+; RV32IM-NEXT:    sub a3, t3, a3
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    slli a2, a3, 6
-; RV32IM-NEXT:    sub a3, a3, a2
-; RV32IM-NEXT:    sw a3, 0(a0)
+; RV32IM-NEXT:    slli a3, a2, 6
+; RV32IM-NEXT:    sub a2, a2, a3
+; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
 ; RV32IM-NEXT:    sw t6, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 7c3294fa81dcf..71b9d9b8c5dca 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -1241,8 +1241,8 @@ define i64 @foo2(ptr %p) {
 define void @PR41129(ptr %p64) {
 ; RV32-LABEL: PR41129:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    or a3, a1, a2
 ; RV32-NEXT:    beqz a3, .LBB37_2
 ; RV32-NEXT:  # %bb.1: # %false
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
index 6a605b2cc53ae..1c5b42f038b17 100644
--- a/llvm/test/CodeGen/RISCV/reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -8,23 +8,23 @@
 define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_sum_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 4(a0)
-; RV32-NEXT:    lw a2, 0(a0)
-; RV32-NEXT:    lw a3, 12(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a0, a3, a0
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_sum_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 8(a0)
-; RV64-NEXT:    lw a2, 0(a0)
-; RV64-NEXT:    lw a3, 24(a0)
-; RV64-NEXT:    lw a0, 16(a0)
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a2, 8(a0)
+; RV64-NEXT:    lw a3, 16(a0)
+; RV64-NEXT:    lw a0, 24(a0)
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a0, a3, a0
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
@@ -40,23 +40,23 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_xor_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 4(a0)
-; RV32-NEXT:    lw a2, 0(a0)
-; RV32-NEXT:    lw a3, 12(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    xor a1, a2, a1
-; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_xor_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 8(a0)
-; RV64-NEXT:    ld a2, 0(a0)
-; RV64-NEXT:    ld a3, 24(a0)
-; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    xor a1, a2, a1
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    ld a1, 0(a0)
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 16(a0)
+; RV64-NEXT:    ld a0, 24(a0)
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
@@ -72,23 +72,23 @@ define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 define i32 @reduce_or_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_or_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 4(a0)
-; RV32-NEXT:    lw a2, 0(a0)
-; RV32-NEXT:    lw a3, 12(a0)
-; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    or a0, a0, a3
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a0, 12(a0)
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_or_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 8(a0)
-; RV64-NEXT:    ld a2, 0(a0)
-; RV64-NEXT:    ld a3, 24(a0)
-; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    or a1, a2, a1
-; RV64-NEXT:    or a0, a0, a3
+; RV64-NEXT:    ld a1, 0(a0)
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 16(a0)
+; RV64-NEXT:    ld a0, 24(a0)
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    or a0, a3, a0
 ; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 56fe3340c83e7..5a91481572552 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -749,22 +749,22 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s5, 12(a1)
-; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lw a0, 4(a1)
+; RV32I-NEXT:    lw s5, 0(a1)
 ; RV32I-NEXT:    lw s2, 8(a1)
-; RV32I-NEXT:    lw s6, 0(a1)
-; RV32I-NEXT:    srli a0, a2, 1
-; RV32I-NEXT:    lui a1, 349525
-; RV32I-NEXT:    addi s3, a1, 1365
-; RV32I-NEXT:    and a0, a0, s3
-; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi s4, a0, 819
-; RV32I-NEXT:    and a0, a2, s4
-; RV32I-NEXT:    srli a2, a2, 2
-; RV32I-NEXT:    and a1, a2, s4
-; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw s6, 12(a1)
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi s3, a2, 1365
+; RV32I-NEXT:    and a1, a1, s3
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lui a1, 209715
+; RV32I-NEXT:    addi s4, a1, 819
+; RV32I-NEXT:    and a1, a0, s4
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, s4
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    lui a1, 61681
@@ -775,9 +775,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __mulsi3@plt
 ; RV32I-NEXT:    srli s8, a0, 24
-; RV32I-NEXT:    srli a0, s6, 1
+; RV32I-NEXT:    srli a0, s5, 1
 ; RV32I-NEXT:    and a0, a0, s3
-; RV32I-NEXT:    sub a0, s6, a0
+; RV32I-NEXT:    sub a0, s5, a0
 ; RV32I-NEXT:    and a1, a0, s4
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    and a0, a0, s4
@@ -789,9 +789,9 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    call __mulsi3@plt
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add s8, a0, s8
-; RV32I-NEXT:    srli a0, s5, 1
+; RV32I-NEXT:    srli a0, s6, 1
 ; RV32I-NEXT:    and a0, a0, s3
-; RV32I-NEXT:    sub a0, s5, a0
+; RV32I-NEXT:    sub a0, s6, a0
 ; RV32I-NEXT:    and a1, a0, s4
 ; RV32I-NEXT:    srli a0, a0, 2
 ; RV32I-NEXT:    and a0, a0, s4
@@ -835,20 +835,20 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    cpop a4, a4
-; RV32ZBB-NEXT:    add a3, a4, a3
+; RV32ZBB-NEXT:    lw a2, 4(a1)
+; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 12(a1)
 ; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    cpop a3, a3
+; RV32ZBB-NEXT:    add a2, a3, a2
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    cpop a3, a4
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    sw zero, 12(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   ret <2 x i64> %1
@@ -858,21 +858,21 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ult_two:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a1, 0(a0)
-; RV32I-NEXT:    lw a2, 12(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    addi a4, a1, -1
-; RV32I-NEXT:    and a4, a1, a4
+; RV32I-NEXT:    lw a4, 12(a0)
+; RV32I-NEXT:    addi a0, a1, -1
+; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    seqz a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    addi a1, a3, -1
 ; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    seqz a2, a3
+; RV32I-NEXT:    sub a2, a4, a2
+; RV32I-NEXT:    and a2, a4, a2
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    ret
@@ -901,21 +901,21 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ugt_one:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a1, 0(a0)
-; RV32I-NEXT:    lw a2, 12(a0)
+; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    addi a4, a1, -1
-; RV32I-NEXT:    and a4, a1, a4
+; RV32I-NEXT:    lw a4, 12(a0)
+; RV32I-NEXT:    addi a0, a1, -1
+; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    seqz a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    addi a1, a3, -1
 ; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    seqz a2, a3
+; RV32I-NEXT:    sub a2, a4, a2
+; RV32I-NEXT:    and a2, a4, a2
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    ret
@@ -946,15 +946,15 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_eq_one:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a1, a0
-; RV32I-NEXT:    lw a2, 12(a0)
-; RV32I-NEXT:    lw a0, 4(a0)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    beqz a0, .LBB22_3
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    beqz a3, .LBB22_3
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a0, a3
-; RV32I-NEXT:    xor a0, a0, a3
-; RV32I-NEXT:    sltu a0, a3, a0
+; RV32I-NEXT:    seqz a0, a0
+; RV32I-NEXT:    sub a0, a3, a0
+; RV32I-NEXT:    xor a3, a3, a0
+; RV32I-NEXT:    sltu a0, a0, a3
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    bnez a2, .LBB22_4
 ; RV32I-NEXT:  .LBB22_2:
@@ -963,9 +963,9 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sltu a1, a2, a1
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB22_3:
-; RV32I-NEXT:    addi a0, a3, -1
-; RV32I-NEXT:    xor a3, a3, a0
-; RV32I-NEXT:    sltu a0, a0, a3
+; RV32I-NEXT:    addi a3, a0, -1
+; RV32I-NEXT:    xor a0, a0, a3
+; RV32I-NEXT:    sltu a0, a3, a0
 ; RV32I-NEXT:    lw a1, 8(a1)
 ; RV32I-NEXT:    beqz a2, .LBB22_2
 ; RV32I-NEXT:  .LBB22_4:
@@ -1000,20 +1000,20 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64_ne_one:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    lw a2, 0(a0)
+; RV32I-NEXT:    lw a3, 4(a0)
 ; RV32I-NEXT:    lw a1, 12(a0)
-; RV32I-NEXT:    lw a2, 4(a0)
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    beqz a2, .LBB23_2
+; RV32I-NEXT:    beqz a3, .LBB23_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    seqz a3, a3
-; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    xor a2, a2, a3
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    j .LBB23_3
-; RV32I-NEXT:  .LBB23_2:
-; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    seqz a2, a2
+; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    xor a3, a3, a2
 ; RV32I-NEXT:    sltu a2, a2, a3
+; RV32I-NEXT:    j .LBB23_3
+; RV32I-NEXT:  .LBB23_2:
+; RV32I-NEXT:    addi a3, a2, -1
+; RV32I-NEXT:    xor a2, a2, a3
+; RV32I-NEXT:    sltu a2, a3, a2
 ; RV32I-NEXT:  .LBB23_3:
 ; RV32I-NEXT:    lw a3, 8(a0)
 ; RV32I-NEXT:    xori a0, a2, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index f6d1d3882e5e8..92b88054a1d3b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -8,14 +8,14 @@ declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-LABEL: vpreduce_add_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a4, 12(a1)
-; RV32-NEXT:    lw a5, 8(a1)
-; RV32-NEXT:    lw a6, 4(a1)
-; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a5, 4(a1)
+; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
 ; RV32-NEXT:    lw a7, 0(a2)
-; RV32-NEXT:    lw t0, 12(a2)
+; RV32-NEXT:    lw t0, 4(a2)
 ; RV32-NEXT:    lw t1, 8(a2)
-; RV32-NEXT:    lw a2, 4(a2)
+; RV32-NEXT:    lw a2, 12(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
 ; RV32-NEXT:    xori t3, t3, 1
@@ -23,34 +23,34 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    sltiu a3, a3, 2
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a3, t4, t0
+; RV32-NEXT:    and a3, a3, t0
+; RV32-NEXT:    and a2, t4, a2
 ; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
-; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    and a4, a7, a4
 ; RV32-NEXT:    neg a7, t0
-; RV32-NEXT:    and a5, a7, a5
-; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a3, a3, a4
+; RV32-NEXT:    and a6, a7, a6
 ; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    and a2, a2, a6
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a2, a3
+; RV32-NEXT:    and a2, a2, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a1, a4, a1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpreduce_add_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a4, 24(a1)
-; RV64-NEXT:    lw a5, 16(a1)
-; RV64-NEXT:    lw a6, 8(a1)
-; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    lw a4, 0(a1)
+; RV64-NEXT:    lw a5, 8(a1)
+; RV64-NEXT:    lw a6, 16(a1)
+; RV64-NEXT:    lw a1, 24(a1)
 ; RV64-NEXT:    ld a7, 0(a2)
-; RV64-NEXT:    ld t0, 24(a2)
+; RV64-NEXT:    ld t0, 8(a2)
 ; RV64-NEXT:    ld t1, 16(a2)
-; RV64-NEXT:    ld a2, 8(a2)
+; RV64-NEXT:    ld a2, 24(a2)
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
 ; RV64-NEXT:    sltiu t3, a3, 3
@@ -59,21 +59,21 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    sltiu a3, a3, 2
 ; RV64-NEXT:    xori a3, a3, 1
-; RV64-NEXT:    and a2, a3, a2
-; RV64-NEXT:    and a3, t4, t0
+; RV64-NEXT:    and a3, a3, t0
+; RV64-NEXT:    and a2, t4, a2
 ; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
-; RV64-NEXT:    and a1, a7, a1
+; RV64-NEXT:    and a4, a7, a4
 ; RV64-NEXT:    negw a7, t0
-; RV64-NEXT:    and a5, a7, a5
-; RV64-NEXT:    negw a3, a3
-; RV64-NEXT:    and a3, a3, a4
+; RV64-NEXT:    and a6, a7, a6
 ; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    and a2, a2, a6
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a1, a1, a5
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    negw a2, a3
+; RV64-NEXT:    and a2, a2, a5
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 24a7655dc35a1..81e20a2988163 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -26,38 +26,38 @@ define void @add_v4i32(ptr %x, ptr %y) {
 define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 12(a0)
-; RV32-NEXT:    lw a3, 0(a0)
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    lw a5, 12(a1)
-; RV32-NEXT:    lw a6, 4(a1)
-; RV32-NEXT:    lw a7, 0(a1)
-; RV32-NEXT:    lw t0, 8(a0)
-; RV32-NEXT:    lw a1, 8(a1)
-; RV32-NEXT:    add a4, a4, a6
-; RV32-NEXT:    add a7, a3, a7
-; RV32-NEXT:    sltu a3, a7, a3
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    add a2, a2, a5
-; RV32-NEXT:    add a1, t0, a1
-; RV32-NEXT:    sltu a4, a1, t0
-; RV32-NEXT:    add a2, a2, a4
-; RV32-NEXT:    sw a1, 8(a0)
-; RV32-NEXT:    sw a7, 0(a0)
-; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a5, 4(a0)
+; RV32-NEXT:    lw a6, 8(a0)
+; RV32-NEXT:    lw a7, 12(a0)
+; RV32-NEXT:    lw t0, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    sltu a4, a2, a4
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a7, a1
+; RV32-NEXT:    add t0, a6, t0
+; RV32-NEXT:    sltu a4, t0, a6
+; RV32-NEXT:    add a1, a1, a4
+; RV32-NEXT:    sw t0, 8(a0)
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    sw a3, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 8(a0)
 ; RV64-NEXT:    ld a4, 0(a1)
 ; RV64-NEXT:    ld a1, 8(a1)
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    sd a1, 8(a0)
-; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    sd a2, 0(a0)
 ; RV64-NEXT:    ret
   %a = load <2 x i64>, ptr %x
   %b = load <2 x i64>, ptr %y
@@ -134,14 +134,14 @@ define void @fadd_v4f32(ptr %x, ptr %y) {
 define void @fadd_v2f64(ptr %x, ptr %y) {
 ; CHECK-LABEL: fadd_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fld fa5, 8(a0)
-; CHECK-NEXT:    fld fa4, 0(a0)
+; CHECK-NEXT:    fld fa5, 0(a0)
+; CHECK-NEXT:    fld fa4, 8(a0)
 ; CHECK-NEXT:    fld fa3, 0(a1)
 ; CHECK-NEXT:    fld fa2, 8(a1)
-; CHECK-NEXT:    fadd.d fa4, fa4, fa3
-; CHECK-NEXT:    fadd.d fa5, fa5, fa2
-; CHECK-NEXT:    fsd fa5, 8(a0)
-; CHECK-NEXT:    fsd fa4, 0(a0)
+; CHECK-NEXT:    fadd.d fa5, fa5, fa3
+; CHECK-NEXT:    fadd.d fa4, fa4, fa2
+; CHECK-NEXT:    fsd fa4, 8(a0)
+; CHECK-NEXT:    fsd fa5, 0(a0)
 ; CHECK-NEXT:    ret
   %a = load <2 x double>, ptr %x
   %b = load <2 x double>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
index ec11ada12eaa7..924d9a84ce927 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -134,12 +134,12 @@ define <3 x float> @si2fp_v3i1_v3f32(<3 x i1> %x) {
 define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV32:       # %bb.0:
-; LMULMAX8RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32-NEXT:    vadd.vv v8, v8, v8
@@ -151,12 +151,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV64:       # %bb.0:
-; LMULMAX8RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64-NEXT:    vadd.vv v8, v8, v8
@@ -168,12 +168,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV32-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX1RV32:       # %bb.0:
-; LMULMAX1RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX1RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX1RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV32-NEXT:    vadd.vv v8, v8, v8
@@ -185,12 +185,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV64-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX1RV64:       # %bb.0:
-; LMULMAX1RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX1RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX1RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV64-NEXT:    vadd.vv v8, v8, v8
@@ -202,12 +202,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV32ZVFHMIN-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vadd.vv v8, v8, v8
@@ -219,12 +219,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64ZVFHMIN-LABEL: si2fp_v3i7_v3f32:
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vadd.vv v8, v8, v8
@@ -241,12 +241,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) {
 define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ; LMULMAX8RV32-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV32:       # %bb.0:
-; LMULMAX8RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32-NEXT:    li a0, 127
@@ -258,12 +258,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV64:       # %bb.0:
-; LMULMAX8RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64-NEXT:    li a0, 127
@@ -275,12 +275,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV32-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX1RV32:       # %bb.0:
-; LMULMAX1RV32-NEXT:    lw a1, 4(a0)
-; LMULMAX1RV32-NEXT:    lw a2, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a1, 0(a0)
+; LMULMAX1RV32-NEXT:    lw a2, 4(a0)
 ; LMULMAX1RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV32-NEXT:    lw a0, 8(a0)
-; LMULMAX1RV32-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV32-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV32-NEXT:    li a0, 127
@@ -292,12 +292,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX1RV64-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX1RV64:       # %bb.0:
-; LMULMAX1RV64-NEXT:    ld a1, 8(a0)
-; LMULMAX1RV64-NEXT:    ld a2, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a1, 0(a0)
+; LMULMAX1RV64-NEXT:    ld a2, 8(a0)
 ; LMULMAX1RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX1RV64-NEXT:    ld a0, 16(a0)
-; LMULMAX1RV64-NEXT:    vmv.v.x v8, a2
-; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX1RV64-NEXT:    vmv.v.x v8, a1
+; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX1RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX1RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX1RV64-NEXT:    li a0, 127
@@ -309,12 +309,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV32ZVFHMIN-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV32ZVFHMIN:       # %bb.0:
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 4(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a1, 0(a0)
+; LMULMAX8RV32ZVFHMIN-NEXT:    lw a2, 4(a0)
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV32ZVFHMIN-NEXT:    lw a0, 8(a0)
-; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV32ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV32ZVFHMIN-NEXT:    li a0, 127
@@ -326,12 +326,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) {
 ;
 ; LMULMAX8RV64ZVFHMIN-LABEL: ui2fp_v3i7_v3f32:
 ; LMULMAX8RV64ZVFHMIN:       # %bb.0:
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 8(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a1, 0(a0)
+; LMULMAX8RV64ZVFHMIN-NEXT:    ld a2, 8(a0)
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; LMULMAX8RV64ZVFHMIN-NEXT:    ld a0, 16(a0)
-; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a2
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
 ; LMULMAX8RV64ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 1
 ; LMULMAX8RV64ZVFHMIN-NEXT:    li a0, 127
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 799e20074b042..bad3836c50603 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,25 +7,25 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X:       # %bb.0:
-; ZVE32X-NEXT:    ld a1, 32(a0)
-; ZVE32X-NEXT:    ld a2, 24(a0)
-; ZVE32X-NEXT:    ld a3, 80(a0)
-; ZVE32X-NEXT:    ld a4, 72(a0)
-; ZVE32X-NEXT:    ld a5, 56(a0)
-; ZVE32X-NEXT:    ld a6, 48(a0)
-; ZVE32X-NEXT:    ld a7, 8(a0)
-; ZVE32X-NEXT:    ld a0, 0(a0)
-; ZVE32X-NEXT:    xor a1, a2, a1
-; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    ld a1, 0(a0)
+; ZVE32X-NEXT:    ld a2, 8(a0)
+; ZVE32X-NEXT:    ld a3, 24(a0)
+; ZVE32X-NEXT:    ld a4, 32(a0)
+; ZVE32X-NEXT:    ld a5, 48(a0)
+; ZVE32X-NEXT:    ld a6, 56(a0)
+; ZVE32X-NEXT:    ld a7, 72(a0)
+; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    xor a3, a3, a4
+; ZVE32X-NEXT:    snez a3, a3
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.s.x v8, a1
+; ZVE32X-NEXT:    vmv.s.x v8, a3
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vmv.s.x v8, zero
 ; ZVE32X-NEXT:    vmerge.vim v9, v8, 1, v0
-; ZVE32X-NEXT:    xor a0, a0, a7
-; ZVE32X-NEXT:    snez a0, a0
-; ZVE32X-NEXT:    vmv.s.x v10, a0
+; ZVE32X-NEXT:    xor a1, a1, a2
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    vmv.s.x v10, a1
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
@@ -36,9 +36,9 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a0, a6, a5
-; ZVE32X-NEXT:    snez a0, a0
-; ZVE32X-NEXT:    vmv.s.x v11, a0
+; ZVE32X-NEXT:    xor a1, a5, a6
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    vmv.s.x v11, a1
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
@@ -48,8 +48,8 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v9, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a3, a4, a3
-; ZVE32X-NEXT:    snez a0, a3
+; ZVE32X-NEXT:    xor a0, a7, a0
+; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v10, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 221f9a005bc23..ae6c15cafcddd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3599,9 +3599,9 @@ define <1 x i64> @mgather_v1i64(<1 x ptr> %ptrs, <1 x i1> %m, <1 x i64> %passthr
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB42_2
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a0)
-; RV32ZVE32F-NEXT:    lw a0, 0(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV32ZVE32F-NEXT:    lw a0, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB42_2: # %else
 ; RV32ZVE32F-NEXT:    ret
 ;
@@ -3645,30 +3645,30 @@ define <2 x i64> @mgather_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a4, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB43_4
 ; RV32ZVE32F-NEXT:  .LBB43_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a1, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a1, 12(a1)
 ; RV32ZVE32F-NEXT:    j .LBB43_5
 ; RV32ZVE32F-NEXT:  .LBB43_3:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a4, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB43_2
 ; RV32ZVE32F-NEXT:  .LBB43_4: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a4, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a4, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB43_5: # %else2
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a1, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i64:
@@ -3718,60 +3718,60 @@ define <4 x i64> @mgather_v4i64(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB44_6
 ; RV32ZVE32F-NEXT:  .LBB44_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    bnez a7, .LBB44_7
 ; RV32ZVE32F-NEXT:  .LBB44_3:
-; RV32ZVE32F-NEXT:    lw a7, 20(a1)
-; RV32ZVE32F-NEXT:    lw t0, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw t0, 20(a1)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB44_8
 ; RV32ZVE32F-NEXT:  .LBB44_4:
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw a6, 24(a1)
+; RV32ZVE32F-NEXT:    lw a1, 28(a1)
 ; RV32ZVE32F-NEXT:    j .LBB44_9
 ; RV32ZVE32F-NEXT:  .LBB44_5:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB44_2
 ; RV32ZVE32F-NEXT:  .LBB44_6: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    beqz a7, .LBB44_3
 ; RV32ZVE32F-NEXT:  .LBB44_7: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v9
-; RV32ZVE32F-NEXT:    lw a7, 4(t0)
-; RV32ZVE32F-NEXT:    lw t0, 0(t0)
+; RV32ZVE32F-NEXT:    lw a7, 0(t0)
+; RV32ZVE32F-NEXT:    lw t0, 4(t0)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB44_4
 ; RV32ZVE32F-NEXT:  .LBB44_8: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a6, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB44_9: # %else8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw t0, 16(a0)
-; RV32ZVE32F-NEXT:    sw a7, 20(a0)
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a6, 24(a0)
+; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v4i64:
@@ -3846,60 +3846,60 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) {
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB45_6
 ; RV32ZVE32F-NEXT:  .LBB45_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    bnez a7, .LBB45_7
 ; RV32ZVE32F-NEXT:  .LBB45_3:
-; RV32ZVE32F-NEXT:    lw a7, 20(a1)
-; RV32ZVE32F-NEXT:    lw t0, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw t0, 20(a1)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB45_8
 ; RV32ZVE32F-NEXT:  .LBB45_4:
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw a6, 24(a1)
+; RV32ZVE32F-NEXT:    lw a1, 28(a1)
 ; RV32ZVE32F-NEXT:    j .LBB45_9
 ; RV32ZVE32F-NEXT:  .LBB45_5:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, a6, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB45_2
 ; RV32ZVE32F-NEXT:  .LBB45_6: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a7, a6, 4
 ; RV32ZVE32F-NEXT:    beqz a7, .LBB45_3
 ; RV32ZVE32F-NEXT:  .LBB45_7: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v9
-; RV32ZVE32F-NEXT:    lw a7, 4(t0)
-; RV32ZVE32F-NEXT:    lw t0, 0(t0)
+; RV32ZVE32F-NEXT:    lw a7, 0(t0)
+; RV32ZVE32F-NEXT:    lw t0, 4(t0)
 ; RV32ZVE32F-NEXT:    andi a6, a6, 8
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB45_4
 ; RV32ZVE32F-NEXT:  .LBB45_8: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a6, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB45_9: # %else8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw t0, 16(a0)
-; RV32ZVE32F-NEXT:    sw a7, 20(a0)
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a6, 24(a0)
+; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_truemask_v4i64:
@@ -4025,77 +4025,77 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB47_8
 ; RV32ZVE32F-NEXT:  .LBB47_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a1)
-; RV32ZVE32F-NEXT:    lw a5, 8(a1)
+; RV32ZVE32F-NEXT:    lw a4, 8(a1)
+; RV32ZVE32F-NEXT:    lw a5, 12(a1)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB47_9
 ; RV32ZVE32F-NEXT:  .LBB47_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a1)
-; RV32ZVE32F-NEXT:    lw a7, 16(a1)
+; RV32ZVE32F-NEXT:    lw a6, 16(a1)
+; RV32ZVE32F-NEXT:    lw a7, 20(a1)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB47_10
 ; RV32ZVE32F-NEXT:  .LBB47_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a1)
-; RV32ZVE32F-NEXT:    lw t2, 24(a1)
+; RV32ZVE32F-NEXT:    lw t1, 24(a1)
+; RV32ZVE32F-NEXT:    lw t2, 28(a1)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB47_11
 ; RV32ZVE32F-NEXT:  .LBB47_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a1)
-; RV32ZVE32F-NEXT:    lw t4, 32(a1)
+; RV32ZVE32F-NEXT:    lw t3, 32(a1)
+; RV32ZVE32F-NEXT:    lw t4, 36(a1)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB47_12
 ; RV32ZVE32F-NEXT:  .LBB47_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a1)
-; RV32ZVE32F-NEXT:    lw t6, 40(a1)
+; RV32ZVE32F-NEXT:    lw t5, 40(a1)
+; RV32ZVE32F-NEXT:    lw t6, 44(a1)
 ; RV32ZVE32F-NEXT:    j .LBB47_13
 ; RV32ZVE32F-NEXT:  .LBB47_7:
-; RV32ZVE32F-NEXT:    lw a2, 4(a1)
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a2, 0(a1)
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB47_2
 ; RV32ZVE32F-NEXT:  .LBB47_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB47_3
 ; RV32ZVE32F-NEXT:  .LBB47_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB47_4
 ; RV32ZVE32F-NEXT:  .LBB47_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB47_5
 ; RV32ZVE32F-NEXT:  .LBB47_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB47_6
 ; RV32ZVE32F-NEXT:  .LBB47_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB47_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4109,42 +4109,42 @@ define <8 x i64> @mgather_v8i64(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i64> %passthr
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB47_17
 ; RV32ZVE32F-NEXT:  .LBB47_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a1)
-; RV32ZVE32F-NEXT:    lw a1, 56(a1)
+; RV32ZVE32F-NEXT:    lw t0, 56(a1)
+; RV32ZVE32F-NEXT:    lw a1, 60(a1)
 ; RV32ZVE32F-NEXT:    j .LBB47_18
 ; RV32ZVE32F-NEXT:  .LBB47_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a1)
-; RV32ZVE32F-NEXT:    lw s1, 48(a1)
+; RV32ZVE32F-NEXT:    lw s0, 48(a1)
+; RV32ZVE32F-NEXT:    lw s1, 52(a1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB47_15
 ; RV32ZVE32F-NEXT:  .LBB47_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a1)
-; RV32ZVE32F-NEXT:    lw a1, 0(a1)
+; RV32ZVE32F-NEXT:    lw t0, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:  .LBB47_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a1, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a1, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4272,77 +4272,77 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB48_8
 ; RV32ZVE32F-NEXT:  .LBB48_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB48_9
 ; RV32ZVE32F-NEXT:  .LBB48_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB48_10
 ; RV32ZVE32F-NEXT:  .LBB48_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB48_11
 ; RV32ZVE32F-NEXT:  .LBB48_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB48_12
 ; RV32ZVE32F-NEXT:  .LBB48_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB48_13
 ; RV32ZVE32F-NEXT:  .LBB48_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB48_2
 ; RV32ZVE32F-NEXT:  .LBB48_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB48_3
 ; RV32ZVE32F-NEXT:  .LBB48_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB48_4
 ; RV32ZVE32F-NEXT:  .LBB48_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB48_5
 ; RV32ZVE32F-NEXT:  .LBB48_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB48_6
 ; RV32ZVE32F-NEXT:  .LBB48_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB48_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4356,42 +4356,42 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB48_17
 ; RV32ZVE32F-NEXT:  .LBB48_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB48_18
 ; RV32ZVE32F-NEXT:  .LBB48_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB48_15
 ; RV32ZVE32F-NEXT:  .LBB48_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB48_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4546,77 +4546,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB49_8
 ; RV32ZVE32F-NEXT:  .LBB49_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB49_9
 ; RV32ZVE32F-NEXT:  .LBB49_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB49_10
 ; RV32ZVE32F-NEXT:  .LBB49_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB49_11
 ; RV32ZVE32F-NEXT:  .LBB49_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB49_12
 ; RV32ZVE32F-NEXT:  .LBB49_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB49_13
 ; RV32ZVE32F-NEXT:  .LBB49_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB49_2
 ; RV32ZVE32F-NEXT:  .LBB49_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB49_3
 ; RV32ZVE32F-NEXT:  .LBB49_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB49_4
 ; RV32ZVE32F-NEXT:  .LBB49_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB49_5
 ; RV32ZVE32F-NEXT:  .LBB49_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB49_6
 ; RV32ZVE32F-NEXT:  .LBB49_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB49_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4630,42 +4630,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB49_17
 ; RV32ZVE32F-NEXT:  .LBB49_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB49_18
 ; RV32ZVE32F-NEXT:  .LBB49_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB49_15
 ; RV32ZVE32F-NEXT:  .LBB49_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB49_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -4822,77 +4822,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB50_8
 ; RV32ZVE32F-NEXT:  .LBB50_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB50_9
 ; RV32ZVE32F-NEXT:  .LBB50_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB50_10
 ; RV32ZVE32F-NEXT:  .LBB50_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB50_11
 ; RV32ZVE32F-NEXT:  .LBB50_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB50_12
 ; RV32ZVE32F-NEXT:  .LBB50_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB50_13
 ; RV32ZVE32F-NEXT:  .LBB50_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB50_2
 ; RV32ZVE32F-NEXT:  .LBB50_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB50_3
 ; RV32ZVE32F-NEXT:  .LBB50_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB50_4
 ; RV32ZVE32F-NEXT:  .LBB50_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB50_5
 ; RV32ZVE32F-NEXT:  .LBB50_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB50_6
 ; RV32ZVE32F-NEXT:  .LBB50_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB50_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -4906,42 +4906,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB50_17
 ; RV32ZVE32F-NEXT:  .LBB50_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB50_18
 ; RV32ZVE32F-NEXT:  .LBB50_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB50_15
 ; RV32ZVE32F-NEXT:  .LBB50_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB50_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5105,77 +5105,77 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB51_8
 ; RV32ZVE32F-NEXT:  .LBB51_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB51_9
 ; RV32ZVE32F-NEXT:  .LBB51_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB51_10
 ; RV32ZVE32F-NEXT:  .LBB51_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB51_11
 ; RV32ZVE32F-NEXT:  .LBB51_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB51_12
 ; RV32ZVE32F-NEXT:  .LBB51_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB51_13
 ; RV32ZVE32F-NEXT:  .LBB51_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB51_2
 ; RV32ZVE32F-NEXT:  .LBB51_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB51_3
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB51_4
 ; RV32ZVE32F-NEXT:  .LBB51_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB51_5
 ; RV32ZVE32F-NEXT:  .LBB51_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB51_6
 ; RV32ZVE32F-NEXT:  .LBB51_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB51_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5189,42 +5189,42 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB51_17
 ; RV32ZVE32F-NEXT:  .LBB51_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB51_18
 ; RV32ZVE32F-NEXT:  .LBB51_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB51_15
 ; RV32ZVE32F-NEXT:  .LBB51_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB51_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5380,77 +5380,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB52_8
 ; RV32ZVE32F-NEXT:  .LBB52_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB52_9
 ; RV32ZVE32F-NEXT:  .LBB52_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB52_10
 ; RV32ZVE32F-NEXT:  .LBB52_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB52_11
 ; RV32ZVE32F-NEXT:  .LBB52_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB52_12
 ; RV32ZVE32F-NEXT:  .LBB52_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB52_13
 ; RV32ZVE32F-NEXT:  .LBB52_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB52_2
 ; RV32ZVE32F-NEXT:  .LBB52_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB52_3
 ; RV32ZVE32F-NEXT:  .LBB52_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB52_4
 ; RV32ZVE32F-NEXT:  .LBB52_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB52_5
 ; RV32ZVE32F-NEXT:  .LBB52_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB52_6
 ; RV32ZVE32F-NEXT:  .LBB52_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB52_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5464,42 +5464,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB52_17
 ; RV32ZVE32F-NEXT:  .LBB52_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB52_18
 ; RV32ZVE32F-NEXT:  .LBB52_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB52_15
 ; RV32ZVE32F-NEXT:  .LBB52_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB52_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5657,77 +5657,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB53_8
 ; RV32ZVE32F-NEXT:  .LBB53_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB53_9
 ; RV32ZVE32F-NEXT:  .LBB53_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB53_10
 ; RV32ZVE32F-NEXT:  .LBB53_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB53_11
 ; RV32ZVE32F-NEXT:  .LBB53_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB53_12
 ; RV32ZVE32F-NEXT:  .LBB53_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB53_13
 ; RV32ZVE32F-NEXT:  .LBB53_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB53_2
 ; RV32ZVE32F-NEXT:  .LBB53_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB53_3
 ; RV32ZVE32F-NEXT:  .LBB53_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB53_4
 ; RV32ZVE32F-NEXT:  .LBB53_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB53_5
 ; RV32ZVE32F-NEXT:  .LBB53_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB53_6
 ; RV32ZVE32F-NEXT:  .LBB53_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB53_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -5741,42 +5741,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB53_17
 ; RV32ZVE32F-NEXT:  .LBB53_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB53_18
 ; RV32ZVE32F-NEXT:  .LBB53_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB53_15
 ; RV32ZVE32F-NEXT:  .LBB53_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB53_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -5941,77 +5941,77 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB54_8
 ; RV32ZVE32F-NEXT:  .LBB54_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB54_9
 ; RV32ZVE32F-NEXT:  .LBB54_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB54_10
 ; RV32ZVE32F-NEXT:  .LBB54_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB54_11
 ; RV32ZVE32F-NEXT:  .LBB54_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB54_12
 ; RV32ZVE32F-NEXT:  .LBB54_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB54_13
 ; RV32ZVE32F-NEXT:  .LBB54_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB54_2
 ; RV32ZVE32F-NEXT:  .LBB54_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB54_3
 ; RV32ZVE32F-NEXT:  .LBB54_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB54_4
 ; RV32ZVE32F-NEXT:  .LBB54_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB54_5
 ; RV32ZVE32F-NEXT:  .LBB54_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB54_6
 ; RV32ZVE32F-NEXT:  .LBB54_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB54_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6025,42 +6025,42 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB54_17
 ; RV32ZVE32F-NEXT:  .LBB54_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB54_18
 ; RV32ZVE32F-NEXT:  .LBB54_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB54_15
 ; RV32ZVE32F-NEXT:  .LBB54_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB54_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6214,77 +6214,77 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB55_8
 ; RV32ZVE32F-NEXT:  .LBB55_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB55_9
 ; RV32ZVE32F-NEXT:  .LBB55_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB55_10
 ; RV32ZVE32F-NEXT:  .LBB55_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB55_11
 ; RV32ZVE32F-NEXT:  .LBB55_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB55_12
 ; RV32ZVE32F-NEXT:  .LBB55_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB55_13
 ; RV32ZVE32F-NEXT:  .LBB55_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB55_2
 ; RV32ZVE32F-NEXT:  .LBB55_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB55_3
 ; RV32ZVE32F-NEXT:  .LBB55_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB55_4
 ; RV32ZVE32F-NEXT:  .LBB55_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB55_5
 ; RV32ZVE32F-NEXT:  .LBB55_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB55_6
 ; RV32ZVE32F-NEXT:  .LBB55_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB55_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6298,42 +6298,42 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB55_17
 ; RV32ZVE32F-NEXT:  .LBB55_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB55_18
 ; RV32ZVE32F-NEXT:  .LBB55_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB55_15
 ; RV32ZVE32F-NEXT:  .LBB55_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB55_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6488,77 +6488,77 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB56_8
 ; RV32ZVE32F-NEXT:  .LBB56_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a2)
-; RV32ZVE32F-NEXT:    lw a5, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 12(a2)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB56_9
 ; RV32ZVE32F-NEXT:  .LBB56_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a2)
-; RV32ZVE32F-NEXT:    lw a7, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 16(a2)
+; RV32ZVE32F-NEXT:    lw a7, 20(a2)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB56_10
 ; RV32ZVE32F-NEXT:  .LBB56_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a2)
-; RV32ZVE32F-NEXT:    lw t2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t1, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 28(a2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB56_11
 ; RV32ZVE32F-NEXT:  .LBB56_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a2)
-; RV32ZVE32F-NEXT:    lw t4, 32(a2)
+; RV32ZVE32F-NEXT:    lw t3, 32(a2)
+; RV32ZVE32F-NEXT:    lw t4, 36(a2)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB56_12
 ; RV32ZVE32F-NEXT:  .LBB56_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a2)
-; RV32ZVE32F-NEXT:    lw t6, 40(a2)
+; RV32ZVE32F-NEXT:    lw t5, 40(a2)
+; RV32ZVE32F-NEXT:    lw t6, 44(a2)
 ; RV32ZVE32F-NEXT:    j .LBB56_13
 ; RV32ZVE32F-NEXT:  .LBB56_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a3, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a3, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB56_2
 ; RV32ZVE32F-NEXT:  .LBB56_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB56_3
 ; RV32ZVE32F-NEXT:  .LBB56_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB56_4
 ; RV32ZVE32F-NEXT:  .LBB56_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB56_5
 ; RV32ZVE32F-NEXT:  .LBB56_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB56_6
 ; RV32ZVE32F-NEXT:  .LBB56_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB56_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6572,42 +6572,42 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB56_17
 ; RV32ZVE32F-NEXT:  .LBB56_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a2)
-; RV32ZVE32F-NEXT:    lw a2, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 56(a2)
+; RV32ZVE32F-NEXT:    lw a2, 60(a2)
 ; RV32ZVE32F-NEXT:    j .LBB56_18
 ; RV32ZVE32F-NEXT:  .LBB56_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a2)
-; RV32ZVE32F-NEXT:    lw s1, 48(a2)
+; RV32ZVE32F-NEXT:    lw s0, 48(a2)
+; RV32ZVE32F-NEXT:    lw s1, 52(a2)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB56_15
 ; RV32ZVE32F-NEXT:  .LBB56_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:  .LBB56_18: # %else20
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a2, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -6760,22 +6760,22 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ;
 ; RV32ZVE32F-LABEL: mgather_baseidx_v8i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a4, 56(a2)
-; RV32ZVE32F-NEXT:    lw a5, 48(a2)
-; RV32ZVE32F-NEXT:    lw a6, 40(a2)
+; RV32ZVE32F-NEXT:    lw a4, 8(a2)
+; RV32ZVE32F-NEXT:    lw a5, 16(a2)
+; RV32ZVE32F-NEXT:    lw a6, 24(a2)
 ; RV32ZVE32F-NEXT:    lw a7, 32(a2)
-; RV32ZVE32F-NEXT:    lw t0, 24(a2)
-; RV32ZVE32F-NEXT:    lw t1, 16(a2)
-; RV32ZVE32F-NEXT:    lw t2, 8(a2)
+; RV32ZVE32F-NEXT:    lw t0, 40(a2)
+; RV32ZVE32F-NEXT:    lw t1, 48(a2)
+; RV32ZVE32F-NEXT:    lw t2, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -6785,77 +6785,77 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    lw a2, 0(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB57_8
 ; RV32ZVE32F-NEXT:  .LBB57_2:
-; RV32ZVE32F-NEXT:    lw a4, 12(a3)
-; RV32ZVE32F-NEXT:    lw a5, 8(a3)
+; RV32ZVE32F-NEXT:    lw a4, 8(a3)
+; RV32ZVE32F-NEXT:    lw a5, 12(a3)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    bnez a6, .LBB57_9
 ; RV32ZVE32F-NEXT:  .LBB57_3:
-; RV32ZVE32F-NEXT:    lw a6, 20(a3)
-; RV32ZVE32F-NEXT:    lw a7, 16(a3)
+; RV32ZVE32F-NEXT:    lw a6, 16(a3)
+; RV32ZVE32F-NEXT:    lw a7, 20(a3)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB57_10
 ; RV32ZVE32F-NEXT:  .LBB57_4:
-; RV32ZVE32F-NEXT:    lw t1, 28(a3)
-; RV32ZVE32F-NEXT:    lw t2, 24(a3)
+; RV32ZVE32F-NEXT:    lw t1, 24(a3)
+; RV32ZVE32F-NEXT:    lw t2, 28(a3)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB57_11
 ; RV32ZVE32F-NEXT:  .LBB57_5:
-; RV32ZVE32F-NEXT:    lw t3, 36(a3)
-; RV32ZVE32F-NEXT:    lw t4, 32(a3)
+; RV32ZVE32F-NEXT:    lw t3, 32(a3)
+; RV32ZVE32F-NEXT:    lw t4, 36(a3)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB57_12
 ; RV32ZVE32F-NEXT:  .LBB57_6:
-; RV32ZVE32F-NEXT:    lw t5, 44(a3)
-; RV32ZVE32F-NEXT:    lw t6, 40(a3)
+; RV32ZVE32F-NEXT:    lw t5, 40(a3)
+; RV32ZVE32F-NEXT:    lw t6, 44(a3)
 ; RV32ZVE32F-NEXT:    j .LBB57_13
 ; RV32ZVE32F-NEXT:  .LBB57_7:
-; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    lw a2, 0(a3)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 4(a3)
 ; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB57_2
 ; RV32ZVE32F-NEXT:  .LBB57_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
-; RV32ZVE32F-NEXT:    lw a4, 4(a5)
-; RV32ZVE32F-NEXT:    lw a5, 0(a5)
+; RV32ZVE32F-NEXT:    lw a4, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    andi a6, t0, 4
 ; RV32ZVE32F-NEXT:    beqz a6, .LBB57_3
 ; RV32ZVE32F-NEXT:  .LBB57_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
-; RV32ZVE32F-NEXT:    lw a6, 4(a7)
-; RV32ZVE32F-NEXT:    lw a7, 0(a7)
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
 ; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB57_4
 ; RV32ZVE32F-NEXT:  .LBB57_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
-; RV32ZVE32F-NEXT:    lw t1, 4(t2)
-; RV32ZVE32F-NEXT:    lw t2, 0(t2)
+; RV32ZVE32F-NEXT:    lw t1, 0(t2)
+; RV32ZVE32F-NEXT:    lw t2, 4(t2)
 ; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB57_5
 ; RV32ZVE32F-NEXT:  .LBB57_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
-; RV32ZVE32F-NEXT:    lw t3, 4(t4)
-; RV32ZVE32F-NEXT:    lw t4, 0(t4)
+; RV32ZVE32F-NEXT:    lw t3, 0(t4)
+; RV32ZVE32F-NEXT:    lw t4, 4(t4)
 ; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB57_6
 ; RV32ZVE32F-NEXT:  .LBB57_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s t6, v10
-; RV32ZVE32F-NEXT:    lw t5, 4(t6)
-; RV32ZVE32F-NEXT:    lw t6, 0(t6)
+; RV32ZVE32F-NEXT:    lw t5, 0(t6)
+; RV32ZVE32F-NEXT:    lw t6, 4(t6)
 ; RV32ZVE32F-NEXT:  .LBB57_13: # %else14
 ; RV32ZVE32F-NEXT:    addi sp, sp, -16
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 16
@@ -6869,42 +6869,42 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
-; RV32ZVE32F-NEXT:    lw s0, 4(s1)
-; RV32ZVE32F-NEXT:    lw s1, 0(s1)
+; RV32ZVE32F-NEXT:    lw s0, 0(s1)
+; RV32ZVE32F-NEXT:    lw s1, 4(s1)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    bnez t0, .LBB57_17
 ; RV32ZVE32F-NEXT:  .LBB57_15:
-; RV32ZVE32F-NEXT:    lw t0, 60(a3)
-; RV32ZVE32F-NEXT:    lw a3, 56(a3)
+; RV32ZVE32F-NEXT:    lw t0, 56(a3)
+; RV32ZVE32F-NEXT:    lw a3, 60(a3)
 ; RV32ZVE32F-NEXT:    j .LBB57_18
 ; RV32ZVE32F-NEXT:  .LBB57_16:
-; RV32ZVE32F-NEXT:    lw s0, 52(a3)
-; RV32ZVE32F-NEXT:    lw s1, 48(a3)
+; RV32ZVE32F-NEXT:    lw s0, 48(a3)
+; RV32ZVE32F-NEXT:    lw s1, 52(a3)
 ; RV32ZVE32F-NEXT:    andi t0, t0, -128
 ; RV32ZVE32F-NEXT:    beqz t0, .LBB57_15
 ; RV32ZVE32F-NEXT:  .LBB57_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw t0, 4(a3)
-; RV32ZVE32F-NEXT:    lw a3, 0(a3)
+; RV32ZVE32F-NEXT:    lw t0, 0(a3)
+; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:  .LBB57_18: # %else20
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a5, 8(a0)
-; RV32ZVE32F-NEXT:    sw a4, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw a6, 20(a0)
-; RV32ZVE32F-NEXT:    sw t2, 24(a0)
-; RV32ZVE32F-NEXT:    sw t1, 28(a0)
-; RV32ZVE32F-NEXT:    sw t4, 32(a0)
-; RV32ZVE32F-NEXT:    sw t3, 36(a0)
-; RV32ZVE32F-NEXT:    sw t6, 40(a0)
-; RV32ZVE32F-NEXT:    sw t5, 44(a0)
-; RV32ZVE32F-NEXT:    sw s1, 48(a0)
-; RV32ZVE32F-NEXT:    sw s0, 52(a0)
-; RV32ZVE32F-NEXT:    sw a3, 56(a0)
-; RV32ZVE32F-NEXT:    sw t0, 60(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 8(a0)
+; RV32ZVE32F-NEXT:    sw a5, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
+; RV32ZVE32F-NEXT:    sw t1, 24(a0)
+; RV32ZVE32F-NEXT:    sw t2, 28(a0)
+; RV32ZVE32F-NEXT:    sw t3, 32(a0)
+; RV32ZVE32F-NEXT:    sw t4, 36(a0)
+; RV32ZVE32F-NEXT:    sw t5, 40(a0)
+; RV32ZVE32F-NEXT:    sw t6, 44(a0)
+; RV32ZVE32F-NEXT:    sw s0, 48(a0)
+; RV32ZVE32F-NEXT:    sw s1, 52(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
+; RV32ZVE32F-NEXT:    sw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    addi sp, sp, 16
@@ -11928,22 +11928,22 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1>
 ;
 ; RV32ZVE32F-LABEL: mgather_baseidx_v8f64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a3, 56(a2)
-; RV32ZVE32F-NEXT:    lw a4, 48(a2)
-; RV32ZVE32F-NEXT:    lw a5, 40(a2)
+; RV32ZVE32F-NEXT:    lw a3, 8(a2)
+; RV32ZVE32F-NEXT:    lw a4, 16(a2)
+; RV32ZVE32F-NEXT:    lw a5, 24(a2)
 ; RV32ZVE32F-NEXT:    lw a6, 32(a2)
-; RV32ZVE32F-NEXT:    lw a7, 24(a2)
-; RV32ZVE32F-NEXT:    lw t0, 16(a2)
-; RV32ZVE32F-NEXT:    lw t1, 8(a2)
+; RV32ZVE32F-NEXT:    lw a7, 40(a2)
+; RV32ZVE32F-NEXT:    lw t0, 48(a2)
+; RV32ZVE32F-NEXT:    lw t1, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index ecc81cbaa503d..856cbbfcc2795 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -275,9 +275,9 @@ define void @mscatter_v4i8(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -335,9 +335,9 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -407,13 +407,13 @@ define void @mscatter_v8i8(<8 x i8> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i8:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -822,9 +822,9 @@ define void @mscatter_v4i16(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -882,9 +882,9 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -954,13 +954,13 @@ define void @mscatter_v8i16(<8 x i16> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -1727,9 +1727,9 @@ define void @mscatter_v4i32(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -1787,9 +1787,9 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -1859,13 +1859,13 @@ define void @mscatter_v8i32(<8 x i32> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8i32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -2947,8 +2947,8 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_v2i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 8(a0)
+; RV32ZVE32F-NEXT:    lw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV32ZVE32F-NEXT:    andi a4, a3, 1
@@ -3014,12 +3014,12 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 28(a0)
-; RV32ZVE32F-NEXT:    lw a2, 24(a0)
-; RV32ZVE32F-NEXT:    lw a3, 20(a0)
-; RV32ZVE32F-NEXT:    lw a4, 16(a0)
-; RV32ZVE32F-NEXT:    lw a7, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a6, 8(a0)
+; RV32ZVE32F-NEXT:    lw a7, 12(a0)
+; RV32ZVE32F-NEXT:    lw a3, 16(a0)
+; RV32ZVE32F-NEXT:    lw a4, 20(a0)
+; RV32ZVE32F-NEXT:    lw a1, 24(a0)
+; RV32ZVE32F-NEXT:    lw a2, 28(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v0
 ; RV32ZVE32F-NEXT:    andi t0, a5, 1
@@ -3056,38 +3056,38 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a5, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a5, .LBB38_4
 ; RV32ZVE32F-NEXT:  .LBB38_8: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 8(a1)
 ; RV64ZVE32F-NEXT:    ld a4, 16(a1)
-; RV64ZVE32F-NEXT:    ld a7, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 24(a0)
-; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a2, 24(a1)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a3, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV64ZVE32F-NEXT:    andi t1, a6, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
+; RV64ZVE32F-NEXT:    andi t1, a7, 1
 ; RV64ZVE32F-NEXT:    bnez t1, .LBB38_5
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_6
 ; RV64ZVE32F-NEXT:  .LBB38_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_7
 ; RV64ZVE32F-NEXT:  .LBB38_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB38_8
 ; RV64ZVE32F-NEXT:  .LBB38_4: # %else6
 ; RV64ZVE32F-NEXT:    ret
@@ -3095,15 +3095,15 @@ define void @mscatter_v4i64(<4 x i64> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_2
 ; RV64ZVE32F-NEXT:  .LBB38_6: # %cond.store1
-; RV64ZVE32F-NEXT:    sd t0, 0(a7)
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    sd t0, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_3
 ; RV64ZVE32F-NEXT:  .LBB38_7: # %cond.store3
 ; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB38_4
 ; RV64ZVE32F-NEXT:  .LBB38_8: # %cond.store5
 ; RV64ZVE32F-NEXT:    sd a3, 0(a2)
@@ -3127,12 +3127,12 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ;
 ; RV32ZVE32F-LABEL: mscatter_truemask_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a1, 28(a0)
-; RV32ZVE32F-NEXT:    lw a2, 24(a0)
-; RV32ZVE32F-NEXT:    lw a3, 20(a0)
-; RV32ZVE32F-NEXT:    lw a4, 16(a0)
-; RV32ZVE32F-NEXT:    lw a7, 12(a0)
 ; RV32ZVE32F-NEXT:    lw a6, 8(a0)
+; RV32ZVE32F-NEXT:    lw a7, 12(a0)
+; RV32ZVE32F-NEXT:    lw a3, 16(a0)
+; RV32ZVE32F-NEXT:    lw a4, 20(a0)
+; RV32ZVE32F-NEXT:    lw a1, 24(a0)
+; RV32ZVE32F-NEXT:    lw a2, 28(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmset.m v9
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
@@ -3169,38 +3169,38 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a5, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a5, .LBB39_4
 ; RV32ZVE32F-NEXT:  .LBB39_8: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 8(a1)
 ; RV64ZVE32F-NEXT:    ld a4, 16(a1)
-; RV64ZVE32F-NEXT:    ld a7, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 24(a0)
-; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a2, 24(a1)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a5, 16(a0)
+; RV64ZVE32F-NEXT:    ld a3, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v8
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v8
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v8
 ; RV64ZVE32F-NEXT:    beqz zero, .LBB39_5
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_6
 ; RV64ZVE32F-NEXT:  .LBB39_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_7
 ; RV64ZVE32F-NEXT:  .LBB39_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB39_8
 ; RV64ZVE32F-NEXT:  .LBB39_4: # %else6
 ; RV64ZVE32F-NEXT:    ret
@@ -3208,15 +3208,15 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a6, 2
+; RV64ZVE32F-NEXT:    andi a0, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_2
 ; RV64ZVE32F-NEXT:  .LBB39_6: # %cond.store1
-; RV64ZVE32F-NEXT:    sd t0, 0(a7)
-; RV64ZVE32F-NEXT:    andi a0, a6, 4
+; RV64ZVE32F-NEXT:    sd t0, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_3
 ; RV64ZVE32F-NEXT:  .LBB39_7: # %cond.store3
 ; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a6, 8
+; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB39_4
 ; RV64ZVE32F-NEXT:  .LBB39_8: # %cond.store5
 ; RV64ZVE32F-NEXT:    sd a3, 0(a2)
@@ -3260,51 +3260,51 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a1, 60(a0)
-; RV32ZVE32F-NEXT:    lw a2, 56(a0)
-; RV32ZVE32F-NEXT:    lw a3, 52(a0)
-; RV32ZVE32F-NEXT:    lw a4, 48(a0)
-; RV32ZVE32F-NEXT:    lw a5, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a3, 48(a0)
+; RV32ZVE32F-NEXT:    lw a4, 52(a0)
+; RV32ZVE32F-NEXT:    lw a1, 56(a0)
+; RV32ZVE32F-NEXT:    lw a2, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV32ZVE32F-NEXT:    andi s1, a6, 1
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v0
+; RV32ZVE32F-NEXT:    andi s1, a5, 1
 ; RV32ZVE32F-NEXT:    bnez s1, .LBB41_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
-; RV32ZVE32F-NEXT:    andi a0, a6, 2
+; RV32ZVE32F-NEXT:    andi a0, a5, 2
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_11
 ; RV32ZVE32F-NEXT:  .LBB41_2: # %else2
-; RV32ZVE32F-NEXT:    andi a0, a6, 4
+; RV32ZVE32F-NEXT:    andi a0, a5, 4
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_12
 ; RV32ZVE32F-NEXT:  .LBB41_3: # %else4
-; RV32ZVE32F-NEXT:    andi a0, a6, 8
+; RV32ZVE32F-NEXT:    andi a0, a5, 8
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_13
 ; RV32ZVE32F-NEXT:  .LBB41_4: # %else6
-; RV32ZVE32F-NEXT:    andi a0, a6, 16
+; RV32ZVE32F-NEXT:    andi a0, a5, 16
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_14
 ; RV32ZVE32F-NEXT:  .LBB41_5: # %else8
-; RV32ZVE32F-NEXT:    andi a0, a6, 32
+; RV32ZVE32F-NEXT:    andi a0, a5, 32
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_15
 ; RV32ZVE32F-NEXT:  .LBB41_6: # %else10
-; RV32ZVE32F-NEXT:    andi a0, a6, 64
+; RV32ZVE32F-NEXT:    andi a0, a5, 64
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_16
 ; RV32ZVE32F-NEXT:  .LBB41_7: # %else12
-; RV32ZVE32F-NEXT:    andi a0, a6, -128
+; RV32ZVE32F-NEXT:    andi a0, a5, -128
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_9
 ; RV32ZVE32F-NEXT:  .LBB41_8: # %cond.store13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB41_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3318,7 +3318,7 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vmv.x.s s2, v8
 ; RV32ZVE32F-NEXT:    sw s1, 4(s2)
 ; RV32ZVE32F-NEXT:    sw a0, 0(s2)
-; RV32ZVE32F-NEXT:    andi a0, a6, 2
+; RV32ZVE32F-NEXT:    andi a0, a5, 2
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_2
 ; RV32ZVE32F-NEXT:  .LBB41_11: # %cond.store1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
@@ -3326,47 +3326,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
 ; RV32ZVE32F-NEXT:    sw s0, 4(a0)
 ; RV32ZVE32F-NEXT:    sw t6, 0(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 4
+; RV32ZVE32F-NEXT:    andi a0, a5, 4
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_3
 ; RV32ZVE32F-NEXT:  .LBB41_12: # %cond.store3
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 8
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_4
 ; RV32ZVE32F-NEXT:  .LBB41_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 16
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_5
 ; RV32ZVE32F-NEXT:  .LBB41_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 32
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_6
 ; RV32ZVE32F-NEXT:  .LBB41_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a5, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, 64
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB41_7
 ; RV32ZVE32F-NEXT:  .LBB41_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    andi a0, a6, -128
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    andi a0, a5, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB41_8
 ; RV32ZVE32F-NEXT:    j .LBB41_9
 ;
@@ -3380,47 +3380,47 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV64ZVE32F-NEXT:    .cfi_offset s0, -8
 ; RV64ZVE32F-NEXT:    .cfi_offset s1, -16
 ; RV64ZVE32F-NEXT:    .cfi_offset s2, -24
+; RV64ZVE32F-NEXT:    ld t5, 8(a1)
+; RV64ZVE32F-NEXT:    ld t3, 16(a1)
+; RV64ZVE32F-NEXT:    ld t1, 24(a1)
+; RV64ZVE32F-NEXT:    ld a6, 32(a1)
+; RV64ZVE32F-NEXT:    ld a4, 40(a1)
+; RV64ZVE32F-NEXT:    ld a3, 48(a1)
 ; RV64ZVE32F-NEXT:    ld a2, 56(a1)
-; RV64ZVE32F-NEXT:    ld a4, 48(a1)
-; RV64ZVE32F-NEXT:    ld a6, 40(a1)
-; RV64ZVE32F-NEXT:    ld t1, 32(a1)
-; RV64ZVE32F-NEXT:    ld t3, 24(a1)
-; RV64ZVE32F-NEXT:    ld t5, 16(a1)
-; RV64ZVE32F-NEXT:    ld s0, 8(a1)
-; RV64ZVE32F-NEXT:    ld a3, 56(a0)
-; RV64ZVE32F-NEXT:    ld a5, 48(a0)
-; RV64ZVE32F-NEXT:    ld t0, 40(a0)
-; RV64ZVE32F-NEXT:    ld t2, 32(a0)
-; RV64ZVE32F-NEXT:    ld t4, 24(a0)
-; RV64ZVE32F-NEXT:    ld t6, 16(a0)
 ; RV64ZVE32F-NEXT:    ld s1, 8(a0)
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
-; RV64ZVE32F-NEXT:    andi s2, a7, 1
+; RV64ZVE32F-NEXT:    ld s0, 16(a0)
+; RV64ZVE32F-NEXT:    ld t6, 24(a0)
+; RV64ZVE32F-NEXT:    ld t4, 32(a0)
+; RV64ZVE32F-NEXT:    ld t2, 40(a0)
+; RV64ZVE32F-NEXT:    ld a7, 48(a0)
+; RV64ZVE32F-NEXT:    ld a5, 56(a0)
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT:    vmv.x.s t0, v0
+; RV64ZVE32F-NEXT:    andi s2, t0, 1
 ; RV64ZVE32F-NEXT:    bnez s2, .LBB41_10
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, t0, 2
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_11
 ; RV64ZVE32F-NEXT:  .LBB41_2: # %else2
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    andi a0, t0, 4
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_12
 ; RV64ZVE32F-NEXT:  .LBB41_3: # %else4
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    andi a0, t0, 8
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_13
 ; RV64ZVE32F-NEXT:  .LBB41_4: # %else6
-; RV64ZVE32F-NEXT:    andi a0, a7, 16
+; RV64ZVE32F-NEXT:    andi a0, t0, 16
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_14
 ; RV64ZVE32F-NEXT:  .LBB41_5: # %else8
-; RV64ZVE32F-NEXT:    andi a0, a7, 32
+; RV64ZVE32F-NEXT:    andi a0, t0, 32
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_15
 ; RV64ZVE32F-NEXT:  .LBB41_6: # %else10
-; RV64ZVE32F-NEXT:    andi a0, a7, 64
+; RV64ZVE32F-NEXT:    andi a0, t0, 64
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_16
 ; RV64ZVE32F-NEXT:  .LBB41_7: # %else12
-; RV64ZVE32F-NEXT:    andi a0, a7, -128
+; RV64ZVE32F-NEXT:    andi a0, t0, -128
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_9
 ; RV64ZVE32F-NEXT:  .LBB41_8: # %cond.store13
-; RV64ZVE32F-NEXT:    sd a3, 0(a2)
+; RV64ZVE32F-NEXT:    sd a5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB41_9: # %else14
 ; RV64ZVE32F-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
 ; RV64ZVE32F-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
@@ -3431,31 +3431,31 @@ define void @mscatter_v8i64(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ; RV64ZVE32F-NEXT:    ld a1, 0(a1)
 ; RV64ZVE32F-NEXT:    ld a0, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a0, 0(a1)
-; RV64ZVE32F-NEXT:    andi a0, a7, 2
+; RV64ZVE32F-NEXT:    andi a0, t0, 2
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_2
 ; RV64ZVE32F-NEXT:  .LBB41_11: # %cond.store1
-; RV64ZVE32F-NEXT:    sd s1, 0(s0)
-; RV64ZVE32F-NEXT:    andi a0, a7, 4
+; RV64ZVE32F-NEXT:    sd s1, 0(t5)
+; RV64ZVE32F-NEXT:    andi a0, t0, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_3
 ; RV64ZVE32F-NEXT:  .LBB41_12: # %cond.store3
-; RV64ZVE32F-NEXT:    sd t6, 0(t5)
-; RV64ZVE32F-NEXT:    andi a0, a7, 8
+; RV64ZVE32F-NEXT:    sd s0, 0(t3)
+; RV64ZVE32F-NEXT:    andi a0, t0, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_4
 ; RV64ZVE32F-NEXT:  .LBB41_13: # %cond.store5
-; RV64ZVE32F-NEXT:    sd t4, 0(t3)
-; RV64ZVE32F-NEXT:    andi a0, a7, 16
+; RV64ZVE32F-NEXT:    sd t6, 0(t1)
+; RV64ZVE32F-NEXT:    andi a0, t0, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_5
 ; RV64ZVE32F-NEXT:  .LBB41_14: # %cond.store7
-; RV64ZVE32F-NEXT:    sd t2, 0(t1)
-; RV64ZVE32F-NEXT:    andi a0, a7, 32
+; RV64ZVE32F-NEXT:    sd t4, 0(a6)
+; RV64ZVE32F-NEXT:    andi a0, t0, 32
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_6
 ; RV64ZVE32F-NEXT:  .LBB41_15: # %cond.store9
-; RV64ZVE32F-NEXT:    sd t0, 0(a6)
-; RV64ZVE32F-NEXT:    andi a0, a7, 64
+; RV64ZVE32F-NEXT:    sd t2, 0(a4)
+; RV64ZVE32F-NEXT:    andi a0, t0, 64
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB41_7
 ; RV64ZVE32F-NEXT:  .LBB41_16: # %cond.store11
-; RV64ZVE32F-NEXT:    sd a5, 0(a4)
-; RV64ZVE32F-NEXT:    andi a0, a7, -128
+; RV64ZVE32F-NEXT:    sd a7, 0(a3)
+; RV64ZVE32F-NEXT:    andi a0, t0, -128
 ; RV64ZVE32F-NEXT:    bnez a0, .LBB41_8
 ; RV64ZVE32F-NEXT:    j .LBB41_9
   call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> %val, <8 x ptr> %ptrs, i32 8, <8 x i1> %m)
@@ -3490,20 +3490,20 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -3537,8 +3537,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB42_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3566,53 +3566,53 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_4
 ; RV32ZVE32F-NEXT:  .LBB42_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_5
 ; RV32ZVE32F-NEXT:  .LBB42_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_6
 ; RV32ZVE32F-NEXT:  .LBB42_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB42_7
 ; RV32ZVE32F-NEXT:  .LBB42_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB42_8
 ; RV32ZVE32F-NEXT:    j .LBB42_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -3734,20 +3734,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -3781,8 +3781,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB43_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -3810,53 +3810,53 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_4
 ; RV32ZVE32F-NEXT:  .LBB43_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_5
 ; RV32ZVE32F-NEXT:  .LBB43_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_6
 ; RV32ZVE32F-NEXT:  .LBB43_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB43_7
 ; RV32ZVE32F-NEXT:  .LBB43_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB43_8
 ; RV32ZVE32F-NEXT:    j .LBB43_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -3980,20 +3980,20 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4027,8 +4027,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB44_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4056,53 +4056,53 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_4
 ; RV32ZVE32F-NEXT:  .LBB44_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_5
 ; RV32ZVE32F-NEXT:  .LBB44_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_6
 ; RV32ZVE32F-NEXT:  .LBB44_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB44_7
 ; RV32ZVE32F-NEXT:  .LBB44_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB44_8
 ; RV32ZVE32F-NEXT:    j .LBB44_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4233,20 +4233,20 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4280,8 +4280,8 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB45_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4309,53 +4309,53 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_4
 ; RV32ZVE32F-NEXT:  .LBB45_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_5
 ; RV32ZVE32F-NEXT:  .LBB45_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_6
 ; RV32ZVE32F-NEXT:  .LBB45_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB45_7
 ; RV32ZVE32F-NEXT:  .LBB45_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB45_8
 ; RV32ZVE32F-NEXT:    j .LBB45_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4478,20 +4478,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4525,8 +4525,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB46_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4554,53 +4554,53 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_4
 ; RV32ZVE32F-NEXT:  .LBB46_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_5
 ; RV32ZVE32F-NEXT:  .LBB46_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_6
 ; RV32ZVE32F-NEXT:  .LBB46_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB46_7
 ; RV32ZVE32F-NEXT:  .LBB46_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB46_8
 ; RV32ZVE32F-NEXT:    j .LBB46_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -4725,20 +4725,20 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
@@ -4772,8 +4772,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB47_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -4801,53 +4801,53 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_4
 ; RV32ZVE32F-NEXT:  .LBB47_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_5
 ; RV32ZVE32F-NEXT:  .LBB47_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_6
 ; RV32ZVE32F-NEXT:  .LBB47_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB47_7
 ; RV32ZVE32F-NEXT:  .LBB47_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB47_8
 ; RV32ZVE32F-NEXT:    j .LBB47_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a6, 40(a0)
-; RV64ZVE32F-NEXT:    ld a7, 32(a0)
-; RV64ZVE32F-NEXT:    ld t0, 24(a0)
-; RV64ZVE32F-NEXT:    ld t1, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t2, 8(a0)
+; RV64ZVE32F-NEXT:    ld t1, 16(a0)
+; RV64ZVE32F-NEXT:    ld t0, 24(a0)
+; RV64ZVE32F-NEXT:    ld a7, 32(a0)
+; RV64ZVE32F-NEXT:    ld a6, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    lui a4, 16
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a5, v0
@@ -4980,20 +4980,20 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5026,8 +5026,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB48_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5055,53 +5055,53 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_4
 ; RV32ZVE32F-NEXT:  .LBB48_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_5
 ; RV32ZVE32F-NEXT:  .LBB48_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_6
 ; RV32ZVE32F-NEXT:  .LBB48_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB48_7
 ; RV32ZVE32F-NEXT:  .LBB48_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB48_8
 ; RV32ZVE32F-NEXT:    j .LBB48_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5223,20 +5223,20 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5269,8 +5269,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB49_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5298,53 +5298,53 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_4
 ; RV32ZVE32F-NEXT:  .LBB49_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_5
 ; RV32ZVE32F-NEXT:  .LBB49_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_6
 ; RV32ZVE32F-NEXT:  .LBB49_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB49_7
 ; RV32ZVE32F-NEXT:  .LBB49_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB49_8
 ; RV32ZVE32F-NEXT:    j .LBB49_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5467,20 +5467,20 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
-; RV32ZVE32F-NEXT:    lw a2, 60(a0)
-; RV32ZVE32F-NEXT:    lw a3, 56(a0)
-; RV32ZVE32F-NEXT:    lw a4, 52(a0)
-; RV32ZVE32F-NEXT:    lw a5, 48(a0)
-; RV32ZVE32F-NEXT:    lw a6, 44(a0)
-; RV32ZVE32F-NEXT:    lw a7, 40(a0)
-; RV32ZVE32F-NEXT:    lw t0, 36(a0)
-; RV32ZVE32F-NEXT:    lw t1, 32(a0)
-; RV32ZVE32F-NEXT:    lw t2, 28(a0)
-; RV32ZVE32F-NEXT:    lw t3, 24(a0)
-; RV32ZVE32F-NEXT:    lw t4, 20(a0)
-; RV32ZVE32F-NEXT:    lw t5, 16(a0)
-; RV32ZVE32F-NEXT:    lw s0, 12(a0)
 ; RV32ZVE32F-NEXT:    lw t6, 8(a0)
+; RV32ZVE32F-NEXT:    lw s0, 12(a0)
+; RV32ZVE32F-NEXT:    lw t4, 16(a0)
+; RV32ZVE32F-NEXT:    lw t5, 20(a0)
+; RV32ZVE32F-NEXT:    lw t2, 24(a0)
+; RV32ZVE32F-NEXT:    lw t3, 28(a0)
+; RV32ZVE32F-NEXT:    lw t0, 32(a0)
+; RV32ZVE32F-NEXT:    lw t1, 36(a0)
+; RV32ZVE32F-NEXT:    lw a6, 40(a0)
+; RV32ZVE32F-NEXT:    lw a7, 44(a0)
+; RV32ZVE32F-NEXT:    lw a4, 48(a0)
+; RV32ZVE32F-NEXT:    lw a5, 52(a0)
+; RV32ZVE32F-NEXT:    lw a2, 56(a0)
+; RV32ZVE32F-NEXT:    lw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
@@ -5513,8 +5513,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB50_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -5542,53 +5542,53 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t5, 0(a0)
-; RV32ZVE32F-NEXT:    sw t4, 4(a0)
+; RV32ZVE32F-NEXT:    sw t4, 0(a0)
+; RV32ZVE32F-NEXT:    sw t5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_4
 ; RV32ZVE32F-NEXT:  .LBB50_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t3, 0(a0)
-; RV32ZVE32F-NEXT:    sw t2, 4(a0)
+; RV32ZVE32F-NEXT:    sw t2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t3, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_5
 ; RV32ZVE32F-NEXT:  .LBB50_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t1, 0(a0)
-; RV32ZVE32F-NEXT:    sw t0, 4(a0)
+; RV32ZVE32F-NEXT:    sw t0, 0(a0)
+; RV32ZVE32F-NEXT:    sw t1, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_6
 ; RV32ZVE32F-NEXT:  .LBB50_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a7, 0(a0)
-; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a6, 0(a0)
+; RV32ZVE32F-NEXT:    sw a7, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB50_7
 ; RV32ZVE32F-NEXT:  .LBB50_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a5, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a4, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB50_8
 ; RV32ZVE32F-NEXT:    j .LBB50_9
 ;
 ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a2, 56(a0)
-; RV64ZVE32F-NEXT:    ld a3, 48(a0)
-; RV64ZVE32F-NEXT:    ld a5, 40(a0)
-; RV64ZVE32F-NEXT:    ld a6, 32(a0)
-; RV64ZVE32F-NEXT:    ld a7, 24(a0)
-; RV64ZVE32F-NEXT:    ld t0, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t0, 16(a0)
+; RV64ZVE32F-NEXT:    ld a7, 24(a0)
+; RV64ZVE32F-NEXT:    ld a6, 32(a0)
+; RV64ZVE32F-NEXT:    ld a5, 40(a0)
+; RV64ZVE32F-NEXT:    ld a3, 48(a0)
+; RV64ZVE32F-NEXT:    ld a2, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v0
 ; RV64ZVE32F-NEXT:    andi t2, a4, 1
@@ -5731,36 +5731,36 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
 ; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
 ; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
-; RV32ZVE32F-NEXT:    lw a3, 60(a0)
-; RV32ZVE32F-NEXT:    lw a4, 56(a0)
-; RV32ZVE32F-NEXT:    lw a5, 52(a0)
-; RV32ZVE32F-NEXT:    lw a6, 48(a0)
-; RV32ZVE32F-NEXT:    lw a7, 44(a0)
-; RV32ZVE32F-NEXT:    lw t0, 40(a0)
-; RV32ZVE32F-NEXT:    lw t1, 36(a0)
-; RV32ZVE32F-NEXT:    lw t2, 32(a0)
-; RV32ZVE32F-NEXT:    lw t3, 28(a0)
-; RV32ZVE32F-NEXT:    lw t4, 24(a0)
-; RV32ZVE32F-NEXT:    lw t5, 20(a0)
-; RV32ZVE32F-NEXT:    lw t6, 16(a0)
-; RV32ZVE32F-NEXT:    lw s1, 12(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 8(a0)
-; RV32ZVE32F-NEXT:    lw s2, 56(a2)
-; RV32ZVE32F-NEXT:    lw s3, 48(a2)
-; RV32ZVE32F-NEXT:    lw s4, 40(a2)
+; RV32ZVE32F-NEXT:    lw s1, 12(a0)
+; RV32ZVE32F-NEXT:    lw t5, 16(a0)
+; RV32ZVE32F-NEXT:    lw t6, 20(a0)
+; RV32ZVE32F-NEXT:    lw t3, 24(a0)
+; RV32ZVE32F-NEXT:    lw t4, 28(a0)
+; RV32ZVE32F-NEXT:    lw t1, 32(a0)
+; RV32ZVE32F-NEXT:    lw t2, 36(a0)
+; RV32ZVE32F-NEXT:    lw a7, 40(a0)
+; RV32ZVE32F-NEXT:    lw t0, 44(a0)
+; RV32ZVE32F-NEXT:    lw a5, 48(a0)
+; RV32ZVE32F-NEXT:    lw a6, 52(a0)
+; RV32ZVE32F-NEXT:    lw a3, 56(a0)
+; RV32ZVE32F-NEXT:    lw a4, 60(a0)
+; RV32ZVE32F-NEXT:    lw s2, 8(a2)
+; RV32ZVE32F-NEXT:    lw s3, 16(a2)
+; RV32ZVE32F-NEXT:    lw s4, 24(a2)
 ; RV32ZVE32F-NEXT:    lw s5, 32(a2)
-; RV32ZVE32F-NEXT:    lw s6, 24(a2)
-; RV32ZVE32F-NEXT:    lw s7, 16(a2)
-; RV32ZVE32F-NEXT:    lw s8, 8(a2)
+; RV32ZVE32F-NEXT:    lw s6, 40(a2)
+; RV32ZVE32F-NEXT:    lw s7, 48(a2)
+; RV32ZVE32F-NEXT:    lw s8, 56(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a2), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -5792,8 +5792,8 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a4, 0(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
+; RV32ZVE32F-NEXT:    sw a4, 4(a0)
 ; RV32ZVE32F-NEXT:  .LBB51_9: # %else14
 ; RV32ZVE32F-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
@@ -5827,40 +5827,40 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t6, 0(a0)
-; RV32ZVE32F-NEXT:    sw t5, 4(a0)
+; RV32ZVE32F-NEXT:    sw t5, 0(a0)
+; RV32ZVE32F-NEXT:    sw t6, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 8
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_4
 ; RV32ZVE32F-NEXT:  .LBB51_13: # %cond.store5
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t4, 0(a0)
-; RV32ZVE32F-NEXT:    sw t3, 4(a0)
+; RV32ZVE32F-NEXT:    sw t3, 0(a0)
+; RV32ZVE32F-NEXT:    sw t4, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 16
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_5
 ; RV32ZVE32F-NEXT:  .LBB51_14: # %cond.store7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 4
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t2, 0(a0)
-; RV32ZVE32F-NEXT:    sw t1, 4(a0)
+; RV32ZVE32F-NEXT:    sw t1, 0(a0)
+; RV32ZVE32F-NEXT:    sw t2, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 32
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_6
 ; RV32ZVE32F-NEXT:  .LBB51_15: # %cond.store9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 5
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw t0, 0(a0)
-; RV32ZVE32F-NEXT:    sw a7, 4(a0)
+; RV32ZVE32F-NEXT:    sw a7, 0(a0)
+; RV32ZVE32F-NEXT:    sw t0, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, 64
 ; RV32ZVE32F-NEXT:    beqz a0, .LBB51_7
 ; RV32ZVE32F-NEXT:  .LBB51_16: # %cond.store11
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v10
-; RV32ZVE32F-NEXT:    sw a6, 0(a0)
-; RV32ZVE32F-NEXT:    sw a5, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(a0)
+; RV32ZVE32F-NEXT:    sw a6, 4(a0)
 ; RV32ZVE32F-NEXT:    andi a0, a1, -128
 ; RV32ZVE32F-NEXT:    bnez a0, .LBB51_8
 ; RV32ZVE32F-NEXT:    j .LBB51_9
@@ -5877,16 +5877,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV64ZVE32F-NEXT:    .cfi_offset s1, -16
 ; RV64ZVE32F-NEXT:    .cfi_offset s2, -24
 ; RV64ZVE32F-NEXT:    .cfi_offset s3, -32
-; RV64ZVE32F-NEXT:    ld a3, 56(a0)
-; RV64ZVE32F-NEXT:    ld a4, 48(a0)
-; RV64ZVE32F-NEXT:    ld a6, 40(a0)
-; RV64ZVE32F-NEXT:    ld t1, 32(a0)
+; RV64ZVE32F-NEXT:    ld s0, 8(a0)
+; RV64ZVE32F-NEXT:    ld t5, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t3, 24(a0)
-; RV64ZVE32F-NEXT:    ld t6, 16(a0)
-; RV64ZVE32F-NEXT:    ld s1, 8(a0)
+; RV64ZVE32F-NEXT:    ld t1, 32(a0)
+; RV64ZVE32F-NEXT:    ld a6, 40(a0)
+; RV64ZVE32F-NEXT:    ld a4, 48(a0)
+; RV64ZVE32F-NEXT:    ld a3, 56(a0)
 ; RV64ZVE32F-NEXT:    ld s2, 8(a2)
-; RV64ZVE32F-NEXT:    ld s0, 16(a2)
-; RV64ZVE32F-NEXT:    ld t5, 24(a2)
+; RV64ZVE32F-NEXT:    ld s1, 16(a2)
+; RV64ZVE32F-NEXT:    ld t6, 24(a2)
 ; RV64ZVE32F-NEXT:    ld t4, 32(a2)
 ; RV64ZVE32F-NEXT:    ld t2, 40(a2)
 ; RV64ZVE32F-NEXT:    ld t0, 48(a2)
@@ -5938,19 +5938,19 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV64ZVE32F-NEXT:  .LBB51_11: # %cond.store1
 ; RV64ZVE32F-NEXT:    slli s2, s2, 3
 ; RV64ZVE32F-NEXT:    add s2, a1, s2
-; RV64ZVE32F-NEXT:    sd s1, 0(s2)
+; RV64ZVE32F-NEXT:    sd s0, 0(s2)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 4
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_3
 ; RV64ZVE32F-NEXT:  .LBB51_12: # %cond.store3
-; RV64ZVE32F-NEXT:    slli s0, s0, 3
-; RV64ZVE32F-NEXT:    add s0, a1, s0
-; RV64ZVE32F-NEXT:    sd t6, 0(s0)
+; RV64ZVE32F-NEXT:    slli s1, s1, 3
+; RV64ZVE32F-NEXT:    add s1, a1, s1
+; RV64ZVE32F-NEXT:    sd t5, 0(s1)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_4
 ; RV64ZVE32F-NEXT:  .LBB51_13: # %cond.store5
-; RV64ZVE32F-NEXT:    slli t5, t5, 3
-; RV64ZVE32F-NEXT:    add t5, a1, t5
-; RV64ZVE32F-NEXT:    sd t3, 0(t5)
+; RV64ZVE32F-NEXT:    slli t6, t6, 3
+; RV64ZVE32F-NEXT:    add t6, a1, t6
+; RV64ZVE32F-NEXT:    sd t3, 0(t6)
 ; RV64ZVE32F-NEXT:    andi a0, a7, 16
 ; RV64ZVE32F-NEXT:    beqz a0, .LBB51_5
 ; RV64ZVE32F-NEXT:  .LBB51_14: # %cond.store7
@@ -6075,9 +6075,9 @@ define void @mscatter_v4f16(<4 x half> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -6135,9 +6135,9 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -6207,13 +6207,13 @@ define void @mscatter_v8f16(<8 x half> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f16:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -6927,9 +6927,9 @@ define void @mscatter_v4f32(<4 x float> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -6987,9 +6987,9 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v9
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
@@ -7059,13 +7059,13 @@ define void @mscatter_v8f32(<8 x float> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f32:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -8283,9 +8283,9 @@ define void @mscatter_v4f64(<4 x double> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v4f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi a5, a3, 1
@@ -8380,9 +8380,9 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 24(a0)
-; RV64ZVE32F-NEXT:    ld a2, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a4, 8(a0)
+; RV64ZVE32F-NEXT:    ld a2, 16(a0)
+; RV64ZVE32F-NEXT:    ld a1, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmset.m v8
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
@@ -8529,13 +8529,13 @@ define void @mscatter_v8f64(<8 x double> %val, <8 x ptr> %ptrs, <8 x i1> %m) {
 ;
 ; RV64ZVE32F-LABEL: mscatter_v8f64:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    ld a1, 56(a0)
-; RV64ZVE32F-NEXT:    ld a2, 48(a0)
-; RV64ZVE32F-NEXT:    ld a4, 40(a0)
-; RV64ZVE32F-NEXT:    ld a5, 32(a0)
-; RV64ZVE32F-NEXT:    ld a6, 24(a0)
-; RV64ZVE32F-NEXT:    ld a7, 16(a0)
 ; RV64ZVE32F-NEXT:    ld t0, 8(a0)
+; RV64ZVE32F-NEXT:    ld a7, 16(a0)
+; RV64ZVE32F-NEXT:    ld a6, 24(a0)
+; RV64ZVE32F-NEXT:    ld a5, 32(a0)
+; RV64ZVE32F-NEXT:    ld a4, 40(a0)
+; RV64ZVE32F-NEXT:    ld a2, 48(a0)
+; RV64ZVE32F-NEXT:    ld a1, 56(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v0
 ; RV64ZVE32F-NEXT:    andi t1, a3, 1
@@ -10452,22 +10452,22 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx
 ;
 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8f64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 56(a1)
-; RV32ZVE32F-NEXT:    lw a3, 48(a1)
-; RV32ZVE32F-NEXT:    lw a4, 40(a1)
+; RV32ZVE32F-NEXT:    lw a2, 8(a1)
+; RV32ZVE32F-NEXT:    lw a3, 16(a1)
+; RV32ZVE32F-NEXT:    lw a4, 24(a1)
 ; RV32ZVE32F-NEXT:    lw a5, 32(a1)
-; RV32ZVE32F-NEXT:    lw a6, 24(a1)
-; RV32ZVE32F-NEXT:    lw a7, 16(a1)
-; RV32ZVE32F-NEXT:    lw t0, 8(a1)
+; RV32ZVE32F-NEXT:    lw a6, 40(a1)
+; RV32ZVE32F-NEXT:    lw a7, 48(a1)
+; RV32ZVE32F-NEXT:    lw t0, 56(a1)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vlse32.v v8, (a1), zero
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 534f80a302229..e0ac4b623b5fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -753,18 +753,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    ld a6, 24(a1)
-; ZVE32F-NEXT:    ld a7, 16(a1)
-; ZVE32F-NEXT:    ld t0, 8(a1)
-; ZVE32F-NEXT:    ld t1, 0(a1)
+; ZVE32F-NEXT:    ld a6, 0(a1)
+; ZVE32F-NEXT:    ld a7, 8(a1)
+; ZVE32F-NEXT:    ld t0, 16(a1)
+; ZVE32F-NEXT:    ld t1, 24(a1)
 ; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    mul t3, a2, a5
 ; ZVE32F-NEXT:    add t3, a0, t3
-; ZVE32F-NEXT:    sd t1, 0(t3)
-; ZVE32F-NEXT:    sd t0, 0(t2)
-; ZVE32F-NEXT:    sd a7, 80(t3)
-; ZVE32F-NEXT:    sd a6, 80(t2)
+; ZVE32F-NEXT:    sd a6, 0(t3)
+; ZVE32F-NEXT:    sd a7, 0(t2)
+; ZVE32F-NEXT:    sd t0, 80(t3)
+; ZVE32F-NEXT:    sd t1, 80(t2)
 ; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    addi a4, a4, -4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 352e3a2df1539..afcfc4889c68b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -366,10 +366,10 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -460,10 +460,10 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -483,7 +483,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -499,7 +499,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -557,17 +557,17 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -632,10 +632,10 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -655,7 +655,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -671,7 +671,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -726,17 +726,17 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -813,10 +813,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -836,7 +836,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -852,7 +852,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -1273,10 +1273,10 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    lhu s4, 24(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 56(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
@@ -1449,14 +1449,14 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -1472,7 +1472,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1481,7 +1481,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1593,33 +1593,33 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -1732,14 +1732,14 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -1755,7 +1755,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1764,7 +1764,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1872,33 +1872,33 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -2035,14 +2035,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -2058,7 +2058,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -2067,7 +2067,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -3702,10 +3702,10 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -3796,10 +3796,10 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3819,7 +3819,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -3835,7 +3835,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -3891,17 +3891,17 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 24(a1)
-; CHECK-NOV-NEXT:    lhu s3, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
@@ -3966,10 +3966,10 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3989,7 +3989,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4005,7 +4005,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4060,10 +4060,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
 ; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 16(a1)
 ; CHECK-NOV-NEXT:    lhu s3, 24(a1)
-; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
@@ -4146,10 +4146,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 24(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 24(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -4169,7 +4169,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4185,7 +4185,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4594,10 +4594,10 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    lhu s4, 24(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 56(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
@@ -4770,14 +4770,14 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -4793,7 +4793,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -4802,7 +4802,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -4912,33 +4912,33 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
 ; CHECK-NOV-NEXT:    lhu s1, 0(a1)
-; CHECK-NOV-NEXT:    lhu s2, 56(a1)
-; CHECK-NOV-NEXT:    lhu s3, 48(a1)
-; CHECK-NOV-NEXT:    lhu s4, 40(a1)
-; CHECK-NOV-NEXT:    lhu s5, 32(a1)
-; CHECK-NOV-NEXT:    lhu s6, 24(a1)
-; CHECK-NOV-NEXT:    lhu s7, 16(a1)
-; CHECK-NOV-NEXT:    lhu a1, 8(a1)
+; CHECK-NOV-NEXT:    lhu a2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s2, 16(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
+; CHECK-NOV-NEXT:    lhu s4, 32(a1)
+; CHECK-NOV-NEXT:    lhu s5, 40(a1)
+; CHECK-NOV-NEXT:    lhu s6, 48(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
 ; CHECK-NOV-NEXT:    fcvt.lu.s s2, fs6, rtz
@@ -5049,14 +5049,14 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -5072,7 +5072,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5081,7 +5081,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5193,10 +5193,10 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    lhu s4, 24(a1)
 ; CHECK-NOV-NEXT:    lhu s5, 32(a1)
 ; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu a2, 48(a1)
 ; CHECK-NOV-NEXT:    lhu s7, 56(a1)
-; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
@@ -5351,14 +5351,14 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
 ; CHECK-V-NEXT:    lhu s4, 0(a0)
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu a1, 8(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 24(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 24(a0)
-; CHECK-V-NEXT:    lhu s6, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
-; CHECK-V-NEXT:    fmv.w.x fa0, a0
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    fmv.w.x fa0, a1
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
@@ -5374,7 +5374,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5383,7 +5383,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index bf4dbe7ee14ff..e0c58bc323085 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -10,19 +10,19 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    lhu s0, 0(a0)
-; CHECK-NEXT:    lhu s1, 6(a0)
-; CHECK-NEXT:    lhu s2, 4(a0)
-; CHECK-NEXT:    lhu a0, 2(a0)
-; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    lhu a1, 2(a0)
+; CHECK-NEXT:    lhu s1, 4(a0)
+; CHECK-NEXT:    lhu s2, 6(a0)
+; CHECK-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 8(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 0(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s2
+; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 12(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s1
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NEXT:    fsw fa0, 4(sp)
 ; CHECK-NEXT:    addi a0, sp, 8
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 40adbbcd41fcd..2e41622291e0d 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -508,65 +508,65 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 20
 ; RV32I-NEXT:    sub a3, a3, a1
-; RV32I-NEXT:    lbu a1, 4(a3)
-; RV32I-NEXT:    lbu a4, 5(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    lbu a7, 8(a3)
-; RV32I-NEXT:    lbu t0, 9(a3)
-; RV32I-NEXT:    lbu t1, 10(a3)
-; RV32I-NEXT:    lbu t2, 11(a3)
-; RV32I-NEXT:    lbu t3, 12(a3)
-; RV32I-NEXT:    lbu t4, 13(a3)
-; RV32I-NEXT:    lbu t5, 14(a3)
-; RV32I-NEXT:    lbu t6, 15(a3)
+; RV32I-NEXT:    lbu a1, 0(a3)
+; RV32I-NEXT:    lbu a4, 1(a3)
+; RV32I-NEXT:    lbu a5, 2(a3)
+; RV32I-NEXT:    lbu a6, 3(a3)
+; RV32I-NEXT:    lbu a7, 4(a3)
+; RV32I-NEXT:    lbu t0, 5(a3)
+; RV32I-NEXT:    lbu t1, 6(a3)
+; RV32I-NEXT:    lbu t2, 7(a3)
+; RV32I-NEXT:    lbu t3, 8(a3)
+; RV32I-NEXT:    lbu t4, 9(a3)
+; RV32I-NEXT:    lbu t5, 10(a3)
+; RV32I-NEXT:    lbu t6, 11(a3)
+; RV32I-NEXT:    lbu s0, 12(a3)
+; RV32I-NEXT:    lbu s1, 13(a3)
+; RV32I-NEXT:    lbu s2, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu s0, 0(a3)
-; RV32I-NEXT:    lbu s1, 1(a3)
-; RV32I-NEXT:    lbu s2, 2(a3)
-; RV32I-NEXT:    lbu a3, 3(a3)
 ; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    sll a4, a7, a2
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    sll a4, a1, a2
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, s2
-; RV32I-NEXT:    or a3, a3, s0
-; RV32I-NEXT:    srli a5, a3, 1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    srl a5, a5, a6
 ; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    not a7, a2
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    sll a7, a5, a2
-; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a5, t4, t3
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
 ; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    not t0, a2
+; RV32I-NEXT:    srl a7, a7, t0
+; RV32I-NEXT:    sll t0, a5, a2
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a7, a7, a2
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    sll a3, a3, a2
 ; RV32I-NEXT:    srli a5, a5, 1
 ; RV32I-NEXT:    srl a5, a5, a6
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    sll a2, a3, a2
-; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a7, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
@@ -634,60 +634,60 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw t2, 0(a1)
-; RV32I-NEXT:    lw a3, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a2)
+; RV32I-NEXT:    lw t2, 0(a1)
 ; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    andi t1, a2, 64
 ; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    mv a1, t2
+; RV32I-NEXT:    mv a4, t2
 ; RV32I-NEXT:    beqz t1, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t0, a3
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv t0, a1
+; RV32I-NEXT:    mv a4, a3
 ; RV32I-NEXT:  .LBB10_2:
 ; RV32I-NEXT:    andi a6, a2, 32
-; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    mv a5, a4
 ; RV32I-NEXT:    bnez a6, .LBB10_13
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    bnez t1, .LBB10_14
 ; RV32I-NEXT:  .LBB10_4:
 ; RV32I-NEXT:    beqz a6, .LBB10_6
 ; RV32I-NEXT:  .LBB10_5:
-; RV32I-NEXT:    mv t0, a4
+; RV32I-NEXT:    mv t0, a3
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    slli t3, t0, 1
 ; RV32I-NEXT:    not t2, a2
 ; RV32I-NEXT:    beqz t1, .LBB10_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a3, a7
+; RV32I-NEXT:    mv a1, a7
 ; RV32I-NEXT:  .LBB10_8:
 ; RV32I-NEXT:    srl a7, a5, a2
 ; RV32I-NEXT:    sll t1, t3, t2
 ; RV32I-NEXT:    srl t0, t0, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, a3
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB10_10:
 ; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    slli t1, a4, 1
+; RV32I-NEXT:    slli t1, a3, 1
 ; RV32I-NEXT:    sll t1, t1, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    srl a3, a3, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    slli a1, a3, 1
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    srl a2, a3, a2
+; RV32I-NEXT:    slli a4, a1, 1
+; RV32I-NEXT:    sll a4, a4, t2
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srl a1, a1, a2
 ; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    sll a3, a5, t2
-; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    sw a2, 12(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sll a2, a5, t2
+; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
 ; RV32I-NEXT:    sw t0, 4(a0)
 ; RV32I-NEXT:    sw a7, 0(a0)
 ; RV32I-NEXT:    ret
@@ -695,7 +695,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    beqz t1, .LBB10_4
 ; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    mv a4, t2
+; RV32I-NEXT:    mv a3, t2
 ; RV32I-NEXT:    bnez a6, .LBB10_5
 ; RV32I-NEXT:    j .LBB10_6
 ;
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index fcaa1f7f238f6..856a68d5f277a 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -312,12 +312,12 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:    lbu a1, 12(a0)
 ; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    lw a3, 4(s0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a0, 0(a0)
+; RV32-NEXT:    slli a4, a1, 30
 ; RV32-NEXT:    srli s1, a2, 2
-; RV32-NEXT:    or s1, s1, a0
+; RV32-NEXT:    or s1, s1, a4
 ; RV32-NEXT:    slli a4, a2, 31
-; RV32-NEXT:    lw a0, 0(s0)
 ; RV32-NEXT:    srli a5, a3, 1
 ; RV32-NEXT:    or s2, a5, a4
 ; RV32-NEXT:    srli a1, a1, 2
@@ -391,8 +391,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lbu a0, 12(a0)
 ; RV64-NEXT:    lwu a1, 8(s0)
-; RV64-NEXT:    slli a0, a0, 32
 ; RV64-NEXT:    ld a2, 0(s0)
+; RV64-NEXT:    slli a0, a0, 32
 ; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    slli a0, a0, 29
 ; RV64-NEXT:    srai s1, a0, 31
@@ -464,12 +464,12 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    mv s0, a0
 ; RV32M-NEXT:    lbu a1, 12(a0)
 ; RV32M-NEXT:    lw a2, 8(a0)
-; RV32M-NEXT:    slli a0, a1, 30
-; RV32M-NEXT:    lw a3, 4(s0)
+; RV32M-NEXT:    lw a3, 4(a0)
+; RV32M-NEXT:    lw a0, 0(a0)
+; RV32M-NEXT:    slli a4, a1, 30
 ; RV32M-NEXT:    srli s1, a2, 2
-; RV32M-NEXT:    or s1, s1, a0
+; RV32M-NEXT:    or s1, s1, a4
 ; RV32M-NEXT:    slli a4, a2, 31
-; RV32M-NEXT:    lw a0, 0(s0)
 ; RV32M-NEXT:    srli a5, a3, 1
 ; RV32M-NEXT:    or s2, a5, a4
 ; RV32M-NEXT:    srli a1, a1, 2
@@ -614,12 +614,12 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    mv s0, a0
 ; RV32MV-NEXT:    lbu a1, 12(a0)
 ; RV32MV-NEXT:    lw a2, 8(a0)
-; RV32MV-NEXT:    slli a0, a1, 30
-; RV32MV-NEXT:    lw a3, 4(s0)
+; RV32MV-NEXT:    lw a3, 4(a0)
+; RV32MV-NEXT:    lw a0, 0(a0)
+; RV32MV-NEXT:    slli a4, a1, 30
 ; RV32MV-NEXT:    srli s1, a2, 2
-; RV32MV-NEXT:    or s1, s1, a0
+; RV32MV-NEXT:    or s1, s1, a4
 ; RV32MV-NEXT:    slli a4, a2, 31
-; RV32MV-NEXT:    lw a0, 0(s0)
 ; RV32MV-NEXT:    srli a5, a3, 1
 ; RV32MV-NEXT:    or s2, a5, a4
 ; RV32MV-NEXT:    srli a1, a1, 2
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 091b7d229a06c..ed10341580ef8 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -18,29 +18,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 4(a1)
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh s0, 4(a1)
+; RV32I-NEXT:    lh s1, 8(a1)
+; RV32I-NEXT:    lh s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, -124
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3 at plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, -1003
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -54,9 +54,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: fold_srem_vec_1:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
 ; RV32IM-NEXT:    lh a4, 8(a1)
-; RV32IM-NEXT:    lh a1, 4(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
 ; RV32IM-NEXT:    mulh a5, a2, a5
@@ -69,14 +69,14 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 507375
 ; RV32IM-NEXT:    addi a5, a5, 1981
-; RV32IM-NEXT:    mulh a5, a1, a5
-; RV32IM-NEXT:    sub a5, a5, a1
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    sub a5, a5, a3
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -124
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    lui a5, 342392
 ; RV32IM-NEXT:    addi a5, a5, 669
 ; RV32IM-NEXT:    mulh a5, a4, a5
@@ -88,16 +88,16 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 780943
 ; RV32IM-NEXT:    addi a5, a5, 1809
-; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    mulh a5, a1, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 8
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -110,29 +110,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 8(a1)
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh s0, 8(a1)
+; RV64I-NEXT:    lh s1, 16(a1)
+; RV64I-NEXT:    lh s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, -124
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3 at plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, -1003
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -145,52 +145,52 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64IM-NEXT:    lh a3, 0(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a5, 16(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a6
-; RV64IM-NEXT:    sub a3, a3, a1
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a6
+; RV64IM-NEXT:    sub a2, a2, a4
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
 ; RV64IM-NEXT:    li a7, -124
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a5, a6
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 5
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a5, a6
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 5
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
 ; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    mulh a3, a4, a6
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 7
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a5, a5, a2
+; RV64IM-NEXT:    mulh a2, a1, a6
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 7
+; RV64IM-NEXT:    add a2, a2, a6
 ; RV64IM-NEXT:    li a6, -1003
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
   ret <4 x i16> %1
@@ -206,29 +206,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
-; RV32I-NEXT:    lh s2, 4(a1)
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh s0, 4(a1)
+; RV32I-NEXT:    lh s1, 8(a1)
+; RV32I-NEXT:    lh s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3 at plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -242,9 +242,9 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: fold_srem_vec_2:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
 ; RV32IM-NEXT:    lh a4, 8(a1)
-; RV32IM-NEXT:    lh a1, 4(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
 ; RV32IM-NEXT:    mulh a6, a2, a5
@@ -255,13 +255,13 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a2, a2, a6
-; RV32IM-NEXT:    mulh a6, a1, a5
-; RV32IM-NEXT:    add a6, a6, a1
+; RV32IM-NEXT:    mulh a6, a3, a5
+; RV32IM-NEXT:    add a6, a6, a3
 ; RV32IM-NEXT:    srli t0, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a1, a1, a6
+; RV32IM-NEXT:    sub a3, a3, a6
 ; RV32IM-NEXT:    mulh a6, a4, a5
 ; RV32IM-NEXT:    add a6, a6, a4
 ; RV32IM-NEXT:    srli t0, a6, 31
@@ -269,16 +269,16 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a4, a4, a6
-; RV32IM-NEXT:    mulh a5, a3, a5
-; RV32IM-NEXT:    add a5, a5, a3
+; RV32IM-NEXT:    mulh a5, a1, a5
+; RV32IM-NEXT:    add a5, a5, a1
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -291,29 +291,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
-; RV64I-NEXT:    lh s2, 8(a1)
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh s0, 8(a1)
+; RV64I-NEXT:    lh s1, 16(a1)
+; RV64I-NEXT:    lh s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3 at plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -326,45 +326,45 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI1_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
+; RV64IM-NEXT:    lh a3, 0(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a5, 16(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a6, a2, a3
-; RV64IM-NEXT:    add a6, a6, a2
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a6, a3, a2
+; RV64IM-NEXT:    add a6, a6, a3
 ; RV64IM-NEXT:    srli a7, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, a7
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a2, a2, a6
-; RV64IM-NEXT:    mulh a6, a1, a3
-; RV64IM-NEXT:    add a6, a6, a1
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    mulh a6, a4, a2
+; RV64IM-NEXT:    add a6, a6, a4
 ; RV64IM-NEXT:    srli t0, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, t0
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a1, a1, a6
-; RV64IM-NEXT:    mulh a6, a5, a3
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    mulh a6, a5, a2
 ; RV64IM-NEXT:    add a6, a6, a5
 ; RV64IM-NEXT:    srli t0, a6, 63
 ; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    add a6, a6, t0
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    subw a5, a5, a6
-; RV64IM-NEXT:    mulh a3, a4, a3
-; RV64IM-NEXT:    add a3, a3, a4
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mulh a2, a1, a2
+; RV64IM-NEXT:    add a2, a2, a1
+; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 6
+; RV64IM-NEXT:    add a2, a2, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   ret <4 x i16> %1
@@ -624,21 +624,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lh a2, 0(a1)
+; RV32I-NEXT:    lh a3, 4(a1)
+; RV32I-NEXT:    lh a4, 8(a1)
 ; RV32I-NEXT:    lh a0, 12(a1)
-; RV32I-NEXT:    lh a3, 8(a1)
-; RV32I-NEXT:    lh a1, 4(a1)
-; RV32I-NEXT:    srli a4, a2, 26
-; RV32I-NEXT:    add a4, a2, a4
-; RV32I-NEXT:    andi a4, a4, -64
-; RV32I-NEXT:    sub s1, a2, a4
-; RV32I-NEXT:    srli a2, a1, 27
-; RV32I-NEXT:    add a2, a1, a2
-; RV32I-NEXT:    andi a2, a2, -32
-; RV32I-NEXT:    sub s2, a1, a2
-; RV32I-NEXT:    srli a1, a3, 29
+; RV32I-NEXT:    srli a1, a2, 26
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    andi a1, a1, -64
+; RV32I-NEXT:    sub s1, a2, a1
+; RV32I-NEXT:    srli a1, a3, 27
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    andi a1, a1, -32
+; RV32I-NEXT:    sub s2, a3, a1
+; RV32I-NEXT:    srli a1, a4, 29
+; RV32I-NEXT:    add a1, a4, a1
 ; RV32I-NEXT:    andi a1, a1, -8
-; RV32I-NEXT:    sub s3, a3, a1
+; RV32I-NEXT:    sub s3, a4, a1
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s0)
@@ -655,36 +655,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
 ; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 4(a1)
+; RV32IM-NEXT:    lh a4, 12(a1)
 ; RV32IM-NEXT:    lh a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    mulh a5, a4, a5
+; RV32IM-NEXT:    add a5, a5, a4
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    srli a5, a1, 26
 ; RV32IM-NEXT:    add a5, a1, a5
 ; RV32IM-NEXT:    andi a5, a5, -64
 ; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a4, 27
-; RV32IM-NEXT:    add a5, a4, a5
+; RV32IM-NEXT:    srli a5, a2, 27
+; RV32IM-NEXT:    add a5, a2, a5
 ; RV32IM-NEXT:    andi a5, a5, -32
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    srli a5, a3, 29
 ; RV32IM-NEXT:    add a5, a3, a5
 ; RV32IM-NEXT:    andi a5, a5, -8
 ; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_power_of_two:
@@ -697,21 +697,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lh a2, 0(a1)
+; RV64I-NEXT:    lh a3, 8(a1)
+; RV64I-NEXT:    lh a4, 16(a1)
 ; RV64I-NEXT:    lh a0, 24(a1)
-; RV64I-NEXT:    lh a3, 16(a1)
-; RV64I-NEXT:    lh a1, 8(a1)
-; RV64I-NEXT:    srli a4, a2, 58
-; RV64I-NEXT:    add a4, a2, a4
-; RV64I-NEXT:    andi a4, a4, -64
-; RV64I-NEXT:    subw s1, a2, a4
-; RV64I-NEXT:    srli a2, a1, 59
-; RV64I-NEXT:    add a2, a1, a2
-; RV64I-NEXT:    andi a2, a2, -32
-; RV64I-NEXT:    subw s2, a1, a2
-; RV64I-NEXT:    srli a1, a3, 61
+; RV64I-NEXT:    srli a1, a2, 58
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    andi a1, a1, -64
+; RV64I-NEXT:    subw s1, a2, a1
+; RV64I-NEXT:    srli a1, a3, 59
 ; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    andi a1, a1, -32
+; RV64I-NEXT:    subw s2, a3, a1
+; RV64I-NEXT:    srli a1, a4, 61
+; RV64I-NEXT:    add a1, a4, a1
 ; RV64I-NEXT:    andi a1, a1, -8
-; RV64I-NEXT:    subw s3, a3, a1
+; RV64I-NEXT:    subw s3, a4, a1
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s0)
@@ -773,24 +773,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh s0, 12(a1)
-; RV32I-NEXT:    lh s1, 8(a1)
 ; RV32I-NEXT:    lh a2, 4(a1)
+; RV32I-NEXT:    lh s0, 8(a1)
+; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    li a1, 654
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __modsi3 at plt
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __modsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s2)
-; RV32I-NEXT:    sh s1, 4(s2)
+; RV32I-NEXT:    sh s0, 4(s2)
 ; RV32I-NEXT:    sh s3, 2(s2)
 ; RV32I-NEXT:    sh zero, 0(s2)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -804,8 +804,8 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: dont_fold_srem_one:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lh a2, 4(a1)
-; RV32IM-NEXT:    lh a3, 12(a1)
-; RV32IM-NEXT:    lh a1, 8(a1)
+; RV32IM-NEXT:    lh a3, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 820904
 ; RV32IM-NEXT:    addi a4, a4, -1903
 ; RV32IM-NEXT:    mulh a4, a2, a4
@@ -818,27 +818,27 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 729444
 ; RV32IM-NEXT:    addi a4, a4, 713
-; RV32IM-NEXT:    mulh a4, a1, a4
-; RV32IM-NEXT:    add a4, a4, a1
+; RV32IM-NEXT:    mulh a4, a3, a4
+; RV32IM-NEXT:    add a4, a4, a3
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 4
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    li a5, 23
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    lui a4, 395996
 ; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a3, a4
+; RV32IM-NEXT:    mulh a4, a1, a4
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 11
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a3, 6(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a3, 4(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -850,24 +850,24 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh s0, 24(a1)
-; RV64I-NEXT:    lh s1, 16(a1)
 ; RV64I-NEXT:    lh a2, 8(a1)
+; RV64I-NEXT:    lh s0, 16(a1)
+; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3 at plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s2)
-; RV64I-NEXT:    sh s1, 4(s2)
+; RV64I-NEXT:    sh s0, 4(s2)
 ; RV64I-NEXT:    sh s3, 2(s2)
 ; RV64I-NEXT:    sh zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -880,42 +880,42 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI4_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
-; RV64IM-NEXT:    lh a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
+; RV64IM-NEXT:    lh a3, 16(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 8
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 8
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a4, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -933,8 +933,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lh a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    lh a0, 8(a1)
+; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    srli a1, a2, 17
 ; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    lui a3, 8
@@ -1005,8 +1005,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lh a2, 8(a1)
 ; RV64I-NEXT:    mv s0, a0
-; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    lh a0, 16(a1)
+; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    srli a1, a2, 49
 ; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    lui a3, 8
@@ -1033,38 +1033,38 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i16_smax:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI5_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI5_0)(a3)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI5_0)(a2)
+; RV64IM-NEXT:    lh a3, 16(a1)
 ; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srli a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    srli a3, a4, 49
-; RV64IM-NEXT:    add a3, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    srli a2, a4, 49
+; RV64IM-NEXT:    add a2, a4, a2
 ; RV64IM-NEXT:    lui a5, 8
-; RV64IM-NEXT:    and a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
+; RV64IM-NEXT:    and a2, a2, a5
+; RV64IM-NEXT:    subw a4, a4, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
 ; RV64IM-NEXT:    sh a4, 2(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -1085,47 +1085,48 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 28(a1)
-; RV32I-NEXT:    lw s1, 24(a1)
-; RV32I-NEXT:    lw s2, 20(a1)
-; RV32I-NEXT:    lw s3, 16(a1)
-; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw s5, 8(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw s0, 8(a1)
+; RV32I-NEXT:    lw s1, 12(a1)
+; RV32I-NEXT:    lw s2, 16(a1)
+; RV32I-NEXT:    lw s3, 20(a1)
+; RV32I-NEXT:    lw s4, 24(a1)
+; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3 at plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s5
-; RV32I-NEXT:    mv a1, s4
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3 at plt
-; RV32I-NEXT:    mv s4, a0
-; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s3
-; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3 at plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3 at plt
 ; RV32I-NEXT:    sw a1, 28(s6)
 ; RV32I-NEXT:    sw a0, 24(s6)
 ; RV32I-NEXT:    sw s3, 20(s6)
 ; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s5, 12(s6)
-; RV32I-NEXT:    sw s4, 8(s6)
+; RV32I-NEXT:    sw s1, 12(s6)
+; RV32I-NEXT:    sw s0, 8(s6)
 ; RV32I-NEXT:    sw s8, 4(s6)
 ; RV32I-NEXT:    sw s7, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1154,47 +1155,48 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 28(a1)
-; RV32IM-NEXT:    lw s1, 24(a1)
-; RV32IM-NEXT:    lw s2, 20(a1)
-; RV32IM-NEXT:    lw s3, 16(a1)
-; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw s5, 8(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw s0, 8(a1)
+; RV32IM-NEXT:    lw s1, 12(a1)
+; RV32IM-NEXT:    lw s2, 16(a1)
+; RV32IM-NEXT:    lw s3, 20(a1)
+; RV32IM-NEXT:    lw s4, 24(a1)
+; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, a3
+; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3 at plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s5
-; RV32IM-NEXT:    mv a1, s4
+; RV32IM-NEXT:    mv a0, s0
+; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3 at plt
-; RV32IM-NEXT:    mv s4, a0
-; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    mv s0, a0
+; RV32IM-NEXT:    mv s1, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s3
-; RV32IM-NEXT:    mv a1, s2
+; RV32IM-NEXT:    mv a0, s2
+; RV32IM-NEXT:    mv a1, s3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3 at plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s1
-; RV32IM-NEXT:    mv a1, s0
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3 at plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
 ; RV32IM-NEXT:    sw a0, 24(s6)
 ; RV32IM-NEXT:    sw s3, 20(s6)
 ; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s5, 12(s6)
-; RV32IM-NEXT:    sw s4, 8(s6)
+; RV32IM-NEXT:    sw s1, 12(s6)
+; RV32IM-NEXT:    sw s0, 8(s6)
 ; RV32IM-NEXT:    sw s8, 4(s6)
 ; RV32IM-NEXT:    sw s7, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -1218,24 +1220,24 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld s0, 24(a1)
-; RV64I-NEXT:    ld s1, 16(a1)
 ; RV64I-NEXT:    ld a2, 8(a1)
+; RV64I-NEXT:    ld s0, 16(a1)
+; RV64I-NEXT:    ld s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __moddi3 at plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __moddi3 at plt
 ; RV64I-NEXT:    sd a0, 24(s2)
-; RV64I-NEXT:    sd s1, 16(s2)
+; RV64I-NEXT:    sd s0, 16(s2)
 ; RV64I-NEXT:    sd s3, 8(s2)
 ; RV64I-NEXT:    sd zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -1248,42 +1250,42 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    ld a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
-; RV64IM-NEXT:    ld a4, 24(a1)
-; RV64IM-NEXT:    ld a1, 8(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 4
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a3, 16(a1)
+; RV64IM-NEXT:    ld a4, 8(a1)
+; RV64IM-NEXT:    ld a1, 24(a1)
+; RV64IM-NEXT:    mulh a2, a3, a2
+; RV64IM-NEXT:    add a2, a2, a3
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 4
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_1)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a1, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 8
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a3, a3, a2
+; RV64IM-NEXT:    mulh a2, a4, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 8
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a1, a1, a3
-; RV64IM-NEXT:    mulh a3, a4, a5
-; RV64IM-NEXT:    srli a5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 11
-; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a4, a4, a2
+; RV64IM-NEXT:    mulh a2, a1, a5
+; RV64IM-NEXT:    srli a5, a2, 63
+; RV64IM-NEXT:    srai a2, a2, 11
+; RV64IM-NEXT:    add a2, a2, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a1, a1, a2
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 24(a0)
-; RV64IM-NEXT:    sd a1, 8(a0)
-; RV64IM-NEXT:    sd a2, 16(a0)
+; RV64IM-NEXT:    sd a1, 24(a0)
+; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 1525804be545c..1a9126acac8da 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,30 +10,30 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 0(a2)
-; RISCV32-NEXT:    lw a4, 12(a1)
-; RISCV32-NEXT:    lw a5, 8(a1)
+; RISCV32-NEXT:    lw a3, 0(a1)
 ; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw a1, 0(a1)
-; RISCV32-NEXT:    lw a6, 12(a2)
-; RISCV32-NEXT:    lw a7, 8(a2)
+; RISCV32-NEXT:    lw a4, 8(a1)
+; RISCV32-NEXT:    lw a5, 12(a1)
+; RISCV32-NEXT:    lw a1, 0(a2)
 ; RISCV32-NEXT:    lw t0, 4(a2)
-; RISCV32-NEXT:    mulhu a2, a1, a3
-; RISCV32-NEXT:    mul t1, t2, a3
+; RISCV32-NEXT:    lw a6, 8(a2)
+; RISCV32-NEXT:    lw a7, 12(a2)
+; RISCV32-NEXT:    mulhu a2, a3, a1
+; RISCV32-NEXT:    mul t1, t2, a1
 ; RISCV32-NEXT:    add a2, t1, a2
 ; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, t2, a3
+; RISCV32-NEXT:    mulhu t3, t2, a1
 ; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a1, t0
+; RISCV32-NEXT:    mul t1, a3, t0
 ; RISCV32-NEXT:    add a2, t1, a2
 ; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, a1, t0
+; RISCV32-NEXT:    mulhu t3, a3, t0
 ; RISCV32-NEXT:    add t1, t3, t1
 ; RISCV32-NEXT:    add t5, t4, t1
 ; RISCV32-NEXT:    mul t6, t2, t0
 ; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, a7, a1
-; RISCV32-NEXT:    mul s3, a5, a3
+; RISCV32-NEXT:    mul t1, a6, a3
+; RISCV32-NEXT:    mul s3, a4, a1
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t3, t1, s0
@@ -42,15 +42,15 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    mulhu t5, t2, t0
 ; RISCV32-NEXT:    add t4, t5, t4
 ; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, a7
-; RISCV32-NEXT:    mul t5, a6, a1
+; RISCV32-NEXT:    mul t4, t2, a6
+; RISCV32-NEXT:    mul t5, a7, a3
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, a7, a1
+; RISCV32-NEXT:    mulhu s1, a6, a3
 ; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, t0, a5
-; RISCV32-NEXT:    mul t5, a4, a3
+; RISCV32-NEXT:    mul t4, t0, a4
+; RISCV32-NEXT:    mul t5, a5, a1
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a5, a3
+; RISCV32-NEXT:    mulhu t5, a4, a1
 ; RISCV32-NEXT:    add t6, t5, t4
 ; RISCV32-NEXT:    add t4, t6, s2
 ; RISCV32-NEXT:    sltu s3, s4, s3
@@ -63,35 +63,35 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:  .LBB0_2: # %start
 ; RISCV32-NEXT:    sltu s0, s2, s1
 ; RISCV32-NEXT:    snez s1, t2
-; RISCV32-NEXT:    snez s2, a6
+; RISCV32-NEXT:    snez s2, a7
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a6, a1
+; RISCV32-NEXT:    mulhu s2, a7, a3
 ; RISCV32-NEXT:    snez s2, s2
 ; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, a7
+; RISCV32-NEXT:    mulhu t2, t2, a6
 ; RISCV32-NEXT:    snez t2, t2
 ; RISCV32-NEXT:    or t2, s1, t2
 ; RISCV32-NEXT:    or t2, t2, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
 ; RISCV32-NEXT:    snez t6, t0
-; RISCV32-NEXT:    snez s0, a4
+; RISCV32-NEXT:    snez s0, a5
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a4, a3
+; RISCV32-NEXT:    mulhu s0, a5, a1
 ; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu t0, t0, a5
+; RISCV32-NEXT:    mulhu t0, t0, a4
 ; RISCV32-NEXT:    snez t0, t0
 ; RISCV32-NEXT:    or t0, t6, t0
 ; RISCV32-NEXT:    or t0, t0, t5
-; RISCV32-NEXT:    or a6, a7, a6
+; RISCV32-NEXT:    or a6, a6, a7
 ; RISCV32-NEXT:    snez a6, a6
-; RISCV32-NEXT:    or a4, a5, a4
+; RISCV32-NEXT:    or a4, a4, a5
 ; RISCV32-NEXT:    snez a4, a4
 ; RISCV32-NEXT:    and a4, a4, a6
 ; RISCV32-NEXT:    or a4, a4, t0
 ; RISCV32-NEXT:    or a5, t2, t3
 ; RISCV32-NEXT:    or a4, a4, a5
-; RISCV32-NEXT:    mul a1, a1, a3
+; RISCV32-NEXT:    mul a1, a3, a1
 ; RISCV32-NEXT:    andi a4, a4, 1
 ; RISCV32-NEXT:    sw a1, 0(a0)
 ; RISCV32-NEXT:    sw a2, 4(a0)
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 8fc4465ffab1f..765a3a1ee801c 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -19,29 +19,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu a2, 0(a1)
+; RV32I-NEXT:    lhu s0, 4(a1)
+; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 124
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3 at plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 98
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 1003
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -55,9 +55,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: fold_urem_vec_1:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lhu a2, 0(a1)
-; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
 ; RV32IM-NEXT:    lhu a4, 8(a1)
-; RV32IM-NEXT:    lhu a1, 4(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a5, a2, a5
@@ -66,10 +66,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 8456
 ; RV32IM-NEXT:    addi a5, a5, 1058
-; RV32IM-NEXT:    mulhu a5, a1, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    li a6, 124
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    lui a5, 10700
 ; RV32IM-NEXT:    addi a5, a5, -1003
 ; RV32IM-NEXT:    mulhu a5, a4, a5
@@ -78,13 +78,13 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1045
 ; RV32IM-NEXT:    addi a5, a5, 1801
-; RV32IM-NEXT:    mulhu a5, a3, a5
+; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    li a6, 1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -97,29 +97,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu a2, 0(a1)
+; RV64I-NEXT:    lhu s0, 8(a1)
+; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 124
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3 at plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 98
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 1003
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -132,38 +132,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64IM-NEXT:    lhu a3, 0(a1)
+; RV64IM-NEXT:    lhu a4, 8(a1)
 ; RV64IM-NEXT:    lhu a5, 16(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulhu a3, a1, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulhu a2, a4, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
 ; RV64IM-NEXT:    li a7, 124
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a5, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a5, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
 ; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    mulhu a3, a4, a6
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a5, a5, a2
+; RV64IM-NEXT:    mulhu a2, a1, a6
 ; RV64IM-NEXT:    li a6, 1003
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
   ret <4 x i16> %1
@@ -179,29 +179,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
-; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu a2, 0(a1)
+; RV32I-NEXT:    lhu s0, 4(a1)
+; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s2, 12(a1)
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3 at plt
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    li a1, 95
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s3)
 ; RV32I-NEXT:    sh s1, 4(s3)
-; RV32I-NEXT:    sh s2, 2(s3)
+; RV32I-NEXT:    sh s0, 2(s3)
 ; RV32I-NEXT:    sh s4, 0(s3)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -215,27 +215,27 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: fold_urem_vec_2:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lhu a2, 0(a1)
-; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
 ; RV32IM-NEXT:    lhu a4, 8(a1)
-; RV32IM-NEXT:    lhu a1, 4(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a2, a2, a6
-; RV32IM-NEXT:    mulhu a6, a1, a5
+; RV32IM-NEXT:    mulhu a6, a3, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a1, a1, a6
+; RV32IM-NEXT:    sub a3, a3, a6
 ; RV32IM-NEXT:    mulhu a6, a4, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a4, a4, a6
-; RV32IM-NEXT:    mulhu a5, a3, a5
+; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
-; RV32IM-NEXT:    sh a1, 2(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -248,29 +248,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
-; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu a2, 0(a1)
+; RV64I-NEXT:    lhu s0, 8(a1)
+; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s2, 24(a1)
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s4, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3 at plt
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    li a1, 95
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s3)
 ; RV64I-NEXT:    sh s1, 4(s3)
-; RV64I-NEXT:    sh s2, 2(s3)
+; RV64I-NEXT:    sh s0, 2(s3)
 ; RV64I-NEXT:    sh s4, 0(s3)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
@@ -283,29 +283,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_2:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 0(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI1_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI1_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
+; RV64IM-NEXT:    lui a2, %hi(.LCPI1_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI1_0)(a2)
+; RV64IM-NEXT:    lhu a3, 0(a1)
+; RV64IM-NEXT:    lhu a4, 8(a1)
 ; RV64IM-NEXT:    lhu a5, 16(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
-; RV64IM-NEXT:    mulhu a6, a2, a3
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a6, a3, a2
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a2, a2, a6
-; RV64IM-NEXT:    mulhu a6, a1, a3
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    mulhu a6, a4, a2
 ; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a1, a1, a6
-; RV64IM-NEXT:    mulhu a6, a5, a3
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    mulhu a6, a5, a2
 ; RV64IM-NEXT:    mul a6, a6, a7
 ; RV64IM-NEXT:    subw a5, a5, a6
-; RV64IM-NEXT:    mulhu a3, a4, a3
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    mulhu a2, a1, a2
+; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a5, 4(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   ret <4 x i16> %1
@@ -531,19 +531,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu a2, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
+; RV32I-NEXT:    lhu s1, 0(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
-; RV32I-NEXT:    lhu s3, 0(a1)
+; RV32I-NEXT:    lhu s3, 8(a1)
+; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3 at plt
-; RV32I-NEXT:    andi a1, s3, 63
+; RV32I-NEXT:    andi a1, s1, 63
 ; RV32I-NEXT:    andi a2, s2, 31
-; RV32I-NEXT:    andi s1, s1, 7
+; RV32I-NEXT:    andi a3, s3, 7
 ; RV32I-NEXT:    sh a0, 6(s0)
-; RV32I-NEXT:    sh s1, 4(s0)
+; RV32I-NEXT:    sh a3, 4(s0)
 ; RV32I-NEXT:    sh a2, 2(s0)
 ; RV32I-NEXT:    sh a1, 0(s0)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -556,23 +556,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a2, 4(a1)
 ; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 4(a1)
+; RV32IM-NEXT:    lhu a4, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a4, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a4, a4, 31
+; RV32IM-NEXT:    andi a2, a2, 31
 ; RV32IM-NEXT:    andi a3, a3, 7
 ; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_power_of_two:
@@ -583,19 +583,19 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu a2, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
+; RV64I-NEXT:    lhu s1, 0(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
-; RV64I-NEXT:    lhu s3, 0(a1)
+; RV64I-NEXT:    lhu s3, 16(a1)
+; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3 at plt
-; RV64I-NEXT:    andi a1, s3, 63
+; RV64I-NEXT:    andi a1, s1, 63
 ; RV64I-NEXT:    andi a2, s2, 31
-; RV64I-NEXT:    andi s1, s1, 7
+; RV64I-NEXT:    andi a3, s3, 7
 ; RV64I-NEXT:    sh a0, 6(s0)
-; RV64I-NEXT:    sh s1, 4(s0)
+; RV64I-NEXT:    sh a3, 4(s0)
 ; RV64I-NEXT:    sh a2, 2(s0)
 ; RV64I-NEXT:    sh a1, 0(s0)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -640,24 +640,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lhu s0, 12(a1)
-; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu a2, 4(a1)
+; RV32I-NEXT:    lhu s0, 8(a1)
+; RV32I-NEXT:    lhu s1, 12(a1)
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    li a1, 654
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    li a1, 23
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __umodsi3 at plt
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a1, a0, 1327
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __umodsi3 at plt
 ; RV32I-NEXT:    sh a0, 6(s2)
-; RV32I-NEXT:    sh s1, 4(s2)
+; RV32I-NEXT:    sh s0, 4(s2)
 ; RV32I-NEXT:    sh s3, 2(s2)
 ; RV32I-NEXT:    sh zero, 0(s2)
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -671,8 +671,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    lhu a2, 4(a1)
-; RV32IM-NEXT:    lhu a3, 12(a1)
-; RV32IM-NEXT:    lhu a1, 8(a1)
+; RV32IM-NEXT:    lhu a3, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 1603
 ; RV32IM-NEXT:    addi a4, a4, 1341
 ; RV32IM-NEXT:    mulhu a4, a2, a4
@@ -681,20 +681,20 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 45590
 ; RV32IM-NEXT:    addi a4, a4, 1069
-; RV32IM-NEXT:    mulhu a4, a1, a4
+; RV32IM-NEXT:    mulhu a4, a3, a4
 ; RV32IM-NEXT:    li a5, 23
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    lui a4, 193
 ; RV32IM-NEXT:    addi a4, a4, 1464
-; RV32IM-NEXT:    mulhu a4, a3, a4
+; RV32IM-NEXT:    mulhu a4, a1, a4
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a3, 6(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
+; RV32IM-NEXT:    sh a3, 4(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
@@ -706,24 +706,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lhu s0, 24(a1)
-; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu a2, 8(a1)
+; RV64I-NEXT:    lhu s0, 16(a1)
+; RV64I-NEXT:    lhu s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3 at plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    sh a0, 6(s2)
-; RV64I-NEXT:    sh s1, 4(s2)
+; RV64I-NEXT:    sh s0, 4(s2)
 ; RV64I-NEXT:    sh s3, 2(s2)
 ; RV64I-NEXT:    sh zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -736,32 +736,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 8(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI4_0)(a3)
-; RV64IM-NEXT:    lhu a4, 24(a1)
-; RV64IM-NEXT:    lhu a1, 16(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
+; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
+; RV64IM-NEXT:    lhu a3, 8(a1)
+; RV64IM-NEXT:    lhu a4, 16(a1)
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulhu a3, a1, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a2
+; RV64IM-NEXT:    mulhu a2, a4, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a4, a5
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a1, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a2, 2(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -791,47 +791,48 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 28(a1)
-; RV32I-NEXT:    lw s1, 24(a1)
-; RV32I-NEXT:    lw s2, 20(a1)
-; RV32I-NEXT:    lw s3, 16(a1)
-; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw s5, 8(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw s0, 8(a1)
+; RV32I-NEXT:    lw s1, 12(a1)
+; RV32I-NEXT:    lw s2, 16(a1)
+; RV32I-NEXT:    lw s3, 20(a1)
+; RV32I-NEXT:    lw s4, 24(a1)
+; RV32I-NEXT:    lw s5, 28(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3 at plt
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s5
-; RV32I-NEXT:    mv a1, s4
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3 at plt
-; RV32I-NEXT:    mv s4, a0
-; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s3
-; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3 at plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    mv a0, s4
+; RV32I-NEXT:    mv a1, s5
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3 at plt
 ; RV32I-NEXT:    sw a1, 28(s6)
 ; RV32I-NEXT:    sw a0, 24(s6)
 ; RV32I-NEXT:    sw s3, 20(s6)
 ; RV32I-NEXT:    sw s2, 16(s6)
-; RV32I-NEXT:    sw s5, 12(s6)
-; RV32I-NEXT:    sw s4, 8(s6)
+; RV32I-NEXT:    sw s1, 12(s6)
+; RV32I-NEXT:    sw s0, 8(s6)
 ; RV32I-NEXT:    sw s8, 4(s6)
 ; RV32I-NEXT:    sw s7, 0(s6)
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -860,47 +861,48 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 28(a1)
-; RV32IM-NEXT:    lw s1, 24(a1)
-; RV32IM-NEXT:    lw s2, 20(a1)
-; RV32IM-NEXT:    lw s3, 16(a1)
-; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw s5, 8(a1)
-; RV32IM-NEXT:    lw a1, 4(a1)
+; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw s0, 8(a1)
+; RV32IM-NEXT:    lw s1, 12(a1)
+; RV32IM-NEXT:    lw s2, 16(a1)
+; RV32IM-NEXT:    lw s3, 20(a1)
+; RV32IM-NEXT:    lw s4, 24(a1)
+; RV32IM-NEXT:    lw s5, 28(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
 ; RV32IM-NEXT:    mv a0, a3
+; RV32IM-NEXT:    mv a1, a4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3 at plt
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s5
-; RV32IM-NEXT:    mv a1, s4
+; RV32IM-NEXT:    mv a0, s0
+; RV32IM-NEXT:    mv a1, s1
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3 at plt
-; RV32IM-NEXT:    mv s4, a0
-; RV32IM-NEXT:    mv s5, a1
+; RV32IM-NEXT:    mv s0, a0
+; RV32IM-NEXT:    mv s1, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s3
-; RV32IM-NEXT:    mv a1, s2
+; RV32IM-NEXT:    mv a0, s2
+; RV32IM-NEXT:    mv a1, s3
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3 at plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s1
-; RV32IM-NEXT:    mv a1, s0
+; RV32IM-NEXT:    mv a0, s4
+; RV32IM-NEXT:    mv a1, s5
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3 at plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
 ; RV32IM-NEXT:    sw a0, 24(s6)
 ; RV32IM-NEXT:    sw s3, 20(s6)
 ; RV32IM-NEXT:    sw s2, 16(s6)
-; RV32IM-NEXT:    sw s5, 12(s6)
-; RV32IM-NEXT:    sw s4, 8(s6)
+; RV32IM-NEXT:    sw s1, 12(s6)
+; RV32IM-NEXT:    sw s0, 8(s6)
 ; RV32IM-NEXT:    sw s8, 4(s6)
 ; RV32IM-NEXT:    sw s7, 0(s6)
 ; RV32IM-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
@@ -924,24 +926,24 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    ld s0, 24(a1)
-; RV64I-NEXT:    ld s1, 16(a1)
 ; RV64I-NEXT:    ld a2, 8(a1)
+; RV64I-NEXT:    ld s0, 16(a1)
+; RV64I-NEXT:    ld s1, 24(a1)
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    li a1, 654
 ; RV64I-NEXT:    mv a0, a2
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    mv s3, a0
 ; RV64I-NEXT:    li a1, 23
-; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __umoddi3 at plt
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    lui a0, 1
 ; RV64I-NEXT:    addiw a1, a0, 1327
-; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    mv a0, s1
 ; RV64I-NEXT:    call __umoddi3 at plt
 ; RV64I-NEXT:    sd a0, 24(s2)
-; RV64I-NEXT:    sd s1, 16(s2)
+; RV64I-NEXT:    sd s0, 16(s2)
 ; RV64I-NEXT:    sd s3, 8(s2)
 ; RV64I-NEXT:    sd zero, 0(s2)
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -954,39 +956,39 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    ld a2, 16(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
-; RV64IM-NEXT:    ld a4, 24(a1)
-; RV64IM-NEXT:    ld a1, 8(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
-; RV64IM-NEXT:    sub a5, a2, a3
+; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a3, 16(a1)
+; RV64IM-NEXT:    ld a4, 8(a1)
+; RV64IM-NEXT:    ld a1, 24(a1)
+; RV64IM-NEXT:    mulhu a2, a3, a2
+; RV64IM-NEXT:    sub a5, a3, a2
 ; RV64IM-NEXT:    srli a5, a5, 1
-; RV64IM-NEXT:    add a3, a5, a3
-; RV64IM-NEXT:    srli a3, a3, 4
+; RV64IM-NEXT:    add a2, a5, a2
+; RV64IM-NEXT:    srli a2, a2, 4
 ; RV64IM-NEXT:    li a5, 23
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI6_1)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI6_1)(a6)
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 1
-; RV64IM-NEXT:    mulhu a3, a3, a6
-; RV64IM-NEXT:    srli a3, a3, 7
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a3, a3, a2
+; RV64IM-NEXT:    srli a2, a4, 1
+; RV64IM-NEXT:    mulhu a2, a2, a6
+; RV64IM-NEXT:    srli a2, a2, 7
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    sub a1, a1, a3
-; RV64IM-NEXT:    mulhu a3, a4, a5
-; RV64IM-NEXT:    srli a3, a3, 12
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    sub a4, a4, a2
+; RV64IM-NEXT:    mulhu a2, a1, a5
+; RV64IM-NEXT:    srli a2, a2, 12
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    sub a4, a4, a3
+; RV64IM-NEXT:    mul a2, a2, a5
+; RV64IM-NEXT:    sub a1, a1, a2
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 24(a0)
-; RV64IM-NEXT:    sd a1, 8(a0)
-; RV64IM-NEXT:    sd a2, 16(a0)
+; RV64IM-NEXT:    sd a1, 24(a0)
+; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 59aa1d9ae2893..c5c06d00a66a3 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -498,11 +498,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 20
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 27
-; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-FPELIM-NEXT:    addi a1, sp, 35
-; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 12(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-FPELIM-NEXT:    andi a1, a0, -8
+; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 35
+; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a1)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 48
 ; ILP32-ILP32F-FPELIM-NEXT:    ret
 ;
@@ -522,11 +522,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 4
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-WITHFP-NEXT:    addi a1, s0, 19
-; ILP32-ILP32F-WITHFP-NEXT:    sw a1, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-WITHFP-NEXT:    andi a1, a0, -8
+; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 19
+; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a1)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    addi sp, sp, 48
@@ -545,11 +545,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 20
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 27
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -8
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, sp, 35
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a1, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 4(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a1, a0, -8
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 35
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a1)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 4(a1)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index ed5a522a8a746..c712421f16acb 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -219,30 +219,30 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    lbu t1, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t1
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -355,30 +355,30 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    lbu t1, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t1
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    sll a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -491,30 +491,30 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a7, 3(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    lbu t1, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a3, a7, a6
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t1
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a3, a3, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    sra a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -628,30 +628,30 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    lbu t0, 0(a1)
-; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a5, t1, t0
-; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t2
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    srl a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB6_2
@@ -779,38 +779,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    lbu a3, 14(a0)
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 11(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 9(a0)
-; RV32I-NEXT:    lbu t1, 8(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 5(a0)
-; RV32I-NEXT:    lbu t5, 4(a0)
-; RV32I-NEXT:    lbu t6, 3(a0)
-; RV32I-NEXT:    lbu s0, 2(a0)
-; RV32I-NEXT:    lbu s1, 1(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    sb a7, 10(a2)
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    sb t1, 8(a2)
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb s0, 2(a2)
-; RV32I-NEXT:    sb t6, 3(a2)
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb s1, 1(a2)
-; RV32I-NEXT:    sb t3, 6(a2)
-; RV32I-NEXT:    sb t2, 7(a2)
-; RV32I-NEXT:    sb t5, 4(a2)
-; RV32I-NEXT:    sb t4, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -847,30 +847,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    lbu t0, 0(a1)
-; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    lbu t2, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or a5, t1, t0
-; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t2
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    sll a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB7_2
@@ -998,38 +998,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 20
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    lbu a3, 14(a0)
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 11(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 9(a0)
-; RV32I-NEXT:    lbu t1, 8(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 5(a0)
-; RV32I-NEXT:    lbu t5, 4(a0)
-; RV32I-NEXT:    lbu t6, 3(a0)
-; RV32I-NEXT:    lbu s0, 2(a0)
-; RV32I-NEXT:    lbu s1, 1(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    sb a7, 10(a2)
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    sb t1, 8(a2)
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb s0, 2(a2)
-; RV32I-NEXT:    sb t6, 3(a2)
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb s1, 1(a2)
-; RV32I-NEXT:    sb t3, 6(a2)
-; RV32I-NEXT:    sb t2, 7(a2)
-; RV32I-NEXT:    sb t5, 4(a2)
-; RV32I-NEXT:    sb t4, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1066,30 +1066,30 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    lbu t2, 4(a1)
+; RV64I-NEXT:    lbu t3, 5(a1)
 ; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu t0, 7(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 0(a1)
-; RV64I-NEXT:    lbu t2, 1(a1)
-; RV64I-NEXT:    lbu t3, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or t2, t3, t2
 ; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t3
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a5, a5, 35
-; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a6, t1, t0
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 3
+; RV64I-NEXT:    slli a1, a1, 35
+; RV64I-NEXT:    or a5, a1, a5
 ; RV64I-NEXT:    addi a6, a5, -64
 ; RV64I-NEXT:    sra a1, a3, a5
 ; RV64I-NEXT:    bltz a6, .LBB8_2
@@ -1223,38 +1223,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    lbu a3, 14(a0)
-; RV32I-NEXT:    lbu a4, 13(a0)
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    lbu a6, 11(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 9(a0)
-; RV32I-NEXT:    lbu t1, 8(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 5(a0)
-; RV32I-NEXT:    lbu t5, 4(a0)
-; RV32I-NEXT:    lbu t6, 3(a0)
-; RV32I-NEXT:    lbu s0, 2(a0)
-; RV32I-NEXT:    lbu s1, 1(a0)
-; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    sb a7, 10(a2)
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    sb t1, 8(a2)
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb s0, 2(a2)
-; RV32I-NEXT:    sb t6, 3(a2)
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb s1, 1(a2)
-; RV32I-NEXT:    sb t3, 6(a2)
-; RV32I-NEXT:    sb t2, 7(a2)
-; RV32I-NEXT:    sb t5, 4(a2)
-; RV32I-NEXT:    sb t4, 5(a2)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu t1, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
+; RV32I-NEXT:    lbu t6, 12(a0)
+; RV32I-NEXT:    lbu s0, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb t0, 6(a2)
+; RV32I-NEXT:    sb t1, 7(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1398,81 +1398,81 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 56(sp)
 ; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    lbu t5, 25(a0)
-; RV64I-NEXT:    lbu t4, 26(a0)
-; RV64I-NEXT:    lbu t3, 27(a0)
-; RV64I-NEXT:    lbu t2, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 10(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 11(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 12(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 13(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t6, 14(a0)
-; RV64I-NEXT:    lbu s0, 15(a0)
-; RV64I-NEXT:    lbu s1, 16(a0)
-; RV64I-NEXT:    lbu s2, 17(a0)
-; RV64I-NEXT:    lbu s3, 18(a0)
-; RV64I-NEXT:    lbu s4, 19(a0)
-; RV64I-NEXT:    lbu s5, 20(a0)
-; RV64I-NEXT:    lbu s6, 21(a0)
-; RV64I-NEXT:    lbu s7, 22(a0)
-; RV64I-NEXT:    lbu s8, 24(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
-; RV64I-NEXT:    lbu s10, 0(a0)
-; RV64I-NEXT:    lbu s11, 1(a0)
-; RV64I-NEXT:    lbu ra, 2(a0)
-; RV64I-NEXT:    lbu a5, 3(a0)
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    lbu a1, 6(a0)
-; RV64I-NEXT:    lbu a6, 8(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    sb s9, 23(a2)
-; RV64I-NEXT:    sb s7, 22(a2)
-; RV64I-NEXT:    sb s6, 21(a2)
-; RV64I-NEXT:    sb s5, 20(a2)
-; RV64I-NEXT:    sb s4, 19(a2)
-; RV64I-NEXT:    sb s3, 18(a2)
-; RV64I-NEXT:    sb s2, 17(a2)
-; RV64I-NEXT:    sb s1, 16(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb t1, 29(a2)
-; RV64I-NEXT:    sb t2, 28(a2)
-; RV64I-NEXT:    sb t3, 27(a2)
-; RV64I-NEXT:    sb t4, 26(a2)
-; RV64I-NEXT:    sb t5, 25(a2)
-; RV64I-NEXT:    sb s8, 24(a2)
-; RV64I-NEXT:    sb a0, 7(a2)
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    sb a4, 4(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    sb ra, 2(a2)
-; RV64I-NEXT:    sb s11, 1(a2)
-; RV64I-NEXT:    sb s10, 0(a2)
-; RV64I-NEXT:    sb s0, 15(a2)
-; RV64I-NEXT:    sb t6, 14(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 13(a2)
+; RV64I-NEXT:    add t1, a0, a5
+; RV64I-NEXT:    lbu a0, 0(t1)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 1(t1)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    sb a6, 8(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1615,83 +1615,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a0, a5, 31
-; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a1, 23(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t0, 24(a0)
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t2, 25(a0)
-; RV32I-NEXT:    lbu s8, 22(a0)
-; RV32I-NEXT:    lbu t4, 21(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 19(a0)
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 16(a0)
-; RV32I-NEXT:    lbu s2, 18(a0)
-; RV32I-NEXT:    lbu s3, 17(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s6, 12(a0)
-; RV32I-NEXT:    lbu s7, 11(a0)
-; RV32I-NEXT:    lbu a1, 7(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s9, 8(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 9(a0)
-; RV32I-NEXT:    lbu ra, 6(a0)
-; RV32I-NEXT:    lbu a7, 5(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    sb t1, 26(a2)
-; RV32I-NEXT:    sb s0, 29(a2)
-; RV32I-NEXT:    sb t3, 28(a2)
-; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 31(a2)
-; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 30(a2)
-; RV32I-NEXT:    sb s3, 17(a2)
-; RV32I-NEXT:    sb s1, 16(a2)
-; RV32I-NEXT:    sb t6, 19(a2)
-; RV32I-NEXT:    sb s2, 18(a2)
-; RV32I-NEXT:    sb t4, 21(a2)
-; RV32I-NEXT:    sb t5, 20(a2)
-; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 23(a2)
-; RV32I-NEXT:    sb s8, 22(a2)
-; RV32I-NEXT:    sb s11, 9(a2)
-; RV32I-NEXT:    sb s9, 8(a2)
-; RV32I-NEXT:    sb s7, 11(a2)
-; RV32I-NEXT:    sb s10, 10(a2)
-; RV32I-NEXT:    sb s5, 13(a2)
-; RV32I-NEXT:    sb s6, 12(a2)
-; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 15(a2)
-; RV32I-NEXT:    sb s4, 14(a2)
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    andi a5, a5, 31
+; RV32I-NEXT:    addi a0, sp, 28
+; RV32I-NEXT:    add t1, a0, a5
+; RV32I-NEXT:    lbu a0, 0(t1)
+; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 1(t1)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 1(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    sb ra, 6(a2)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -1843,81 +1843,81 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 88(sp)
 ; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a0, a0, a5
-; RV64I-NEXT:    lbu t5, 25(a0)
-; RV64I-NEXT:    lbu t4, 26(a0)
-; RV64I-NEXT:    lbu t3, 27(a0)
-; RV64I-NEXT:    lbu t2, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 10(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 11(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 12(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 13(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t6, 14(a0)
-; RV64I-NEXT:    lbu s0, 15(a0)
-; RV64I-NEXT:    lbu s1, 16(a0)
-; RV64I-NEXT:    lbu s2, 17(a0)
-; RV64I-NEXT:    lbu s3, 18(a0)
-; RV64I-NEXT:    lbu s4, 19(a0)
-; RV64I-NEXT:    lbu s5, 20(a0)
-; RV64I-NEXT:    lbu s6, 21(a0)
-; RV64I-NEXT:    lbu s7, 22(a0)
-; RV64I-NEXT:    lbu s8, 24(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
-; RV64I-NEXT:    lbu s10, 0(a0)
-; RV64I-NEXT:    lbu s11, 1(a0)
-; RV64I-NEXT:    lbu ra, 2(a0)
-; RV64I-NEXT:    lbu a5, 3(a0)
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    lbu a1, 6(a0)
-; RV64I-NEXT:    lbu a6, 8(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    sb s9, 23(a2)
-; RV64I-NEXT:    sb s7, 22(a2)
-; RV64I-NEXT:    sb s6, 21(a2)
-; RV64I-NEXT:    sb s5, 20(a2)
-; RV64I-NEXT:    sb s4, 19(a2)
-; RV64I-NEXT:    sb s3, 18(a2)
-; RV64I-NEXT:    sb s2, 17(a2)
-; RV64I-NEXT:    sb s1, 16(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb t1, 29(a2)
-; RV64I-NEXT:    sb t2, 28(a2)
-; RV64I-NEXT:    sb t3, 27(a2)
-; RV64I-NEXT:    sb t4, 26(a2)
-; RV64I-NEXT:    sb t5, 25(a2)
-; RV64I-NEXT:    sb s8, 24(a2)
-; RV64I-NEXT:    sb a0, 7(a2)
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    sb a4, 4(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    sb ra, 2(a2)
-; RV64I-NEXT:    sb s11, 1(a2)
-; RV64I-NEXT:    sb s10, 0(a2)
-; RV64I-NEXT:    sb s0, 15(a2)
-; RV64I-NEXT:    sb t6, 14(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 13(a2)
+; RV64I-NEXT:    sub t1, a0, a5
+; RV64I-NEXT:    lbu a0, 0(t1)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 1(t1)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    sb a6, 8(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -2062,81 +2062,81 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 60(sp)
 ; RV32I-NEXT:    andi a5, a5, 31
 ; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a0, a0, a5
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a1, 23(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t0, 24(a0)
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t2, 25(a0)
-; RV32I-NEXT:    lbu s8, 22(a0)
-; RV32I-NEXT:    lbu t4, 21(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 19(a0)
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 16(a0)
-; RV32I-NEXT:    lbu s2, 18(a0)
-; RV32I-NEXT:    lbu s3, 17(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s6, 12(a0)
-; RV32I-NEXT:    lbu s7, 11(a0)
-; RV32I-NEXT:    lbu a1, 7(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s9, 8(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 9(a0)
-; RV32I-NEXT:    lbu ra, 6(a0)
-; RV32I-NEXT:    lbu a7, 5(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    sb t1, 26(a2)
-; RV32I-NEXT:    sb s0, 29(a2)
-; RV32I-NEXT:    sb t3, 28(a2)
-; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 31(a2)
-; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 30(a2)
-; RV32I-NEXT:    sb s3, 17(a2)
-; RV32I-NEXT:    sb s1, 16(a2)
-; RV32I-NEXT:    sb t6, 19(a2)
-; RV32I-NEXT:    sb s2, 18(a2)
-; RV32I-NEXT:    sb t4, 21(a2)
-; RV32I-NEXT:    sb t5, 20(a2)
-; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 23(a2)
-; RV32I-NEXT:    sb s8, 22(a2)
-; RV32I-NEXT:    sb s11, 9(a2)
-; RV32I-NEXT:    sb s9, 8(a2)
-; RV32I-NEXT:    sb s7, 11(a2)
-; RV32I-NEXT:    sb s10, 10(a2)
-; RV32I-NEXT:    sb s5, 13(a2)
-; RV32I-NEXT:    sb s6, 12(a2)
-; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 15(a2)
-; RV32I-NEXT:    sb s4, 14(a2)
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sub t1, a0, a5
+; RV32I-NEXT:    lbu a0, 0(t1)
+; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 1(t1)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 1(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    sb ra, 6(a2)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -2176,10 +2176,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t3, a1
-; RV64I-NEXT:    lbu t2, 29(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    mv t1, a1
+; RV64I-NEXT:    lbu t0, 30(a0)
 ; RV64I-NEXT:    lbu a1, 0(a0)
 ; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a1, 1(a0)
@@ -2192,71 +2190,73 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a1, 5(a0)
 ; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t4, 6(a0)
-; RV64I-NEXT:    lbu t5, 7(a0)
-; RV64I-NEXT:    lbu t6, 8(a0)
-; RV64I-NEXT:    lbu s0, 9(a0)
-; RV64I-NEXT:    lbu s1, 10(a0)
-; RV64I-NEXT:    lbu s2, 11(a0)
-; RV64I-NEXT:    lbu s3, 12(a0)
-; RV64I-NEXT:    lbu s4, 13(a0)
-; RV64I-NEXT:    lbu s5, 14(a0)
-; RV64I-NEXT:    lbu s6, 15(a0)
-; RV64I-NEXT:    lbu s7, 16(a0)
-; RV64I-NEXT:    lbu s8, 17(a0)
-; RV64I-NEXT:    lbu s9, 18(a0)
-; RV64I-NEXT:    lbu s10, 19(a0)
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    lbu a7, 22(a0)
-; RV64I-NEXT:    lbu a6, 23(a0)
-; RV64I-NEXT:    lbu a5, 24(a0)
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a3, 26(a0)
-; RV64I-NEXT:    lbu a1, 27(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t3, 0(t3)
-; RV64I-NEXT:    sd t3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sb t1, 86(sp)
-; RV64I-NEXT:    sb t2, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a1, 83(sp)
-; RV64I-NEXT:    sb a3, 82(sp)
-; RV64I-NEXT:    sb a4, 81(sp)
-; RV64I-NEXT:    sb t0, 87(sp)
-; RV64I-NEXT:    slli t0, t0, 56
-; RV64I-NEXT:    sb a5, 80(sp)
-; RV64I-NEXT:    sb a6, 79(sp)
-; RV64I-NEXT:    sb a7, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s10, 75(sp)
-; RV64I-NEXT:    sb s9, 74(sp)
-; RV64I-NEXT:    sb s8, 73(sp)
-; RV64I-NEXT:    sb s7, 72(sp)
-; RV64I-NEXT:    sb s6, 71(sp)
-; RV64I-NEXT:    sb s5, 70(sp)
-; RV64I-NEXT:    sb s4, 69(sp)
-; RV64I-NEXT:    sb s3, 68(sp)
-; RV64I-NEXT:    sb s2, 67(sp)
-; RV64I-NEXT:    sb s1, 66(sp)
-; RV64I-NEXT:    sb s0, 65(sp)
-; RV64I-NEXT:    sb t6, 64(sp)
-; RV64I-NEXT:    sb t5, 63(sp)
-; RV64I-NEXT:    sb t4, 62(sp)
-; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t0, 63
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t3, 7(a0)
+; RV64I-NEXT:    lbu t4, 8(a0)
+; RV64I-NEXT:    lbu t5, 9(a0)
+; RV64I-NEXT:    lbu t6, 10(a0)
+; RV64I-NEXT:    lbu s0, 11(a0)
+; RV64I-NEXT:    lbu s1, 12(a0)
+; RV64I-NEXT:    lbu s2, 13(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s4, 15(a0)
+; RV64I-NEXT:    lbu s5, 16(a0)
+; RV64I-NEXT:    lbu s6, 17(a0)
+; RV64I-NEXT:    lbu s7, 18(a0)
+; RV64I-NEXT:    lbu s8, 19(a0)
+; RV64I-NEXT:    lbu s9, 20(a0)
+; RV64I-NEXT:    lbu s10, 21(a0)
+; RV64I-NEXT:    lbu s11, 22(a0)
+; RV64I-NEXT:    lbu ra, 23(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a5, 26(a0)
+; RV64I-NEXT:    lbu a4, 27(a0)
+; RV64I-NEXT:    lbu a3, 28(a0)
+; RV64I-NEXT:    lbu a1, 29(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    lbu t1, 0(t1)
+; RV64I-NEXT:    sd t1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sb t0, 86(sp)
+; RV64I-NEXT:    sb a1, 85(sp)
+; RV64I-NEXT:    sb a3, 84(sp)
+; RV64I-NEXT:    sb a4, 83(sp)
+; RV64I-NEXT:    sb a5, 82(sp)
+; RV64I-NEXT:    sb a6, 81(sp)
+; RV64I-NEXT:    sb a0, 87(sp)
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    sb a7, 80(sp)
+; RV64I-NEXT:    sb ra, 79(sp)
+; RV64I-NEXT:    sb s11, 78(sp)
+; RV64I-NEXT:    sb s10, 77(sp)
+; RV64I-NEXT:    sb s9, 76(sp)
+; RV64I-NEXT:    sb s8, 75(sp)
+; RV64I-NEXT:    sb s7, 74(sp)
+; RV64I-NEXT:    sb s6, 73(sp)
+; RV64I-NEXT:    sb s5, 72(sp)
+; RV64I-NEXT:    sb s4, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s2, 69(sp)
+; RV64I-NEXT:    sb s1, 68(sp)
+; RV64I-NEXT:    sb s0, 67(sp)
+; RV64I-NEXT:    sb t6, 66(sp)
+; RV64I-NEXT:    sb t5, 65(sp)
+; RV64I-NEXT:    sb t4, 64(sp)
+; RV64I-NEXT:    sb t3, 63(sp)
+; RV64I-NEXT:    sb t2, 62(sp)
+; RV64I-NEXT:    ld a1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 61(sp)
+; RV64I-NEXT:    ld a1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 60(sp)
+; RV64I-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 59(sp)
+; RV64I-NEXT:    ld a1, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 58(sp)
+; RV64I-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 57(sp)
+; RV64I-NEXT:    ld a1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 56(sp)
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2299,81 +2299,81 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    andi a0, a0, 31
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lbu t5, 25(a0)
-; RV64I-NEXT:    lbu t4, 26(a0)
-; RV64I-NEXT:    lbu t3, 27(a0)
-; RV64I-NEXT:    lbu t2, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 10(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 11(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 12(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 13(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t6, 14(a0)
-; RV64I-NEXT:    lbu s0, 15(a0)
-; RV64I-NEXT:    lbu s1, 16(a0)
-; RV64I-NEXT:    lbu s2, 17(a0)
-; RV64I-NEXT:    lbu s3, 18(a0)
-; RV64I-NEXT:    lbu s4, 19(a0)
-; RV64I-NEXT:    lbu s5, 20(a0)
-; RV64I-NEXT:    lbu s6, 21(a0)
-; RV64I-NEXT:    lbu s7, 22(a0)
-; RV64I-NEXT:    lbu s8, 24(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
-; RV64I-NEXT:    lbu s10, 0(a0)
-; RV64I-NEXT:    lbu s11, 1(a0)
-; RV64I-NEXT:    lbu ra, 2(a0)
-; RV64I-NEXT:    lbu a5, 3(a0)
-; RV64I-NEXT:    lbu a4, 4(a0)
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    lbu a1, 6(a0)
-; RV64I-NEXT:    lbu a6, 8(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    sb s9, 23(a2)
-; RV64I-NEXT:    sb s7, 22(a2)
-; RV64I-NEXT:    sb s6, 21(a2)
-; RV64I-NEXT:    sb s5, 20(a2)
-; RV64I-NEXT:    sb s4, 19(a2)
-; RV64I-NEXT:    sb s3, 18(a2)
-; RV64I-NEXT:    sb s2, 17(a2)
-; RV64I-NEXT:    sb s1, 16(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb t1, 29(a2)
-; RV64I-NEXT:    sb t2, 28(a2)
-; RV64I-NEXT:    sb t3, 27(a2)
-; RV64I-NEXT:    sb t4, 26(a2)
-; RV64I-NEXT:    sb t5, 25(a2)
-; RV64I-NEXT:    sb s8, 24(a2)
-; RV64I-NEXT:    sb a0, 7(a2)
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    sb a4, 4(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    sb ra, 2(a2)
-; RV64I-NEXT:    sb s11, 1(a2)
-; RV64I-NEXT:    sb s10, 0(a2)
-; RV64I-NEXT:    sb s0, 15(a2)
-; RV64I-NEXT:    sb t6, 14(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 13(a2)
+; RV64I-NEXT:    add t1, a1, a0
+; RV64I-NEXT:    lbu a0, 0(t1)
+; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 1(t1)
+; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a6, 2(t1)
+; RV64I-NEXT:    lbu t0, 3(t1)
+; RV64I-NEXT:    lbu t2, 4(t1)
+; RV64I-NEXT:    lbu t3, 5(t1)
+; RV64I-NEXT:    lbu t4, 6(t1)
+; RV64I-NEXT:    lbu t5, 7(t1)
+; RV64I-NEXT:    lbu a0, 8(t1)
+; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 9(t1)
+; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a0, 10(t1)
+; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 11(t1)
+; RV64I-NEXT:    lbu t6, 12(t1)
+; RV64I-NEXT:    lbu s0, 13(t1)
+; RV64I-NEXT:    lbu s1, 14(t1)
+; RV64I-NEXT:    lbu s2, 15(t1)
+; RV64I-NEXT:    lbu s3, 16(t1)
+; RV64I-NEXT:    lbu s4, 17(t1)
+; RV64I-NEXT:    lbu s5, 18(t1)
+; RV64I-NEXT:    lbu s6, 19(t1)
+; RV64I-NEXT:    lbu s7, 20(t1)
+; RV64I-NEXT:    lbu s8, 21(t1)
+; RV64I-NEXT:    lbu s9, 22(t1)
+; RV64I-NEXT:    lbu s10, 23(t1)
+; RV64I-NEXT:    lbu s11, 24(t1)
+; RV64I-NEXT:    lbu ra, 25(t1)
+; RV64I-NEXT:    lbu a5, 26(t1)
+; RV64I-NEXT:    lbu a4, 27(t1)
+; RV64I-NEXT:    lbu a3, 28(t1)
+; RV64I-NEXT:    lbu a1, 29(t1)
+; RV64I-NEXT:    lbu a0, 30(t1)
+; RV64I-NEXT:    lbu t1, 31(t1)
+; RV64I-NEXT:    sb s10, 23(a2)
+; RV64I-NEXT:    sb s9, 22(a2)
+; RV64I-NEXT:    sb s8, 21(a2)
+; RV64I-NEXT:    sb s7, 20(a2)
+; RV64I-NEXT:    sb s6, 19(a2)
+; RV64I-NEXT:    sb s5, 18(a2)
+; RV64I-NEXT:    sb s4, 17(a2)
+; RV64I-NEXT:    sb s3, 16(a2)
+; RV64I-NEXT:    sb t1, 31(a2)
+; RV64I-NEXT:    sb a0, 30(a2)
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    sb ra, 25(a2)
+; RV64I-NEXT:    sb s11, 24(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 4(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 1(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    sb s2, 15(a2)
+; RV64I-NEXT:    sb s1, 14(a2)
+; RV64I-NEXT:    sb s0, 13(a2)
+; RV64I-NEXT:    sb t6, 12(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    sb a6, 8(a2)
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -2406,10 +2406,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t3, a1
-; RV32I-NEXT:    lbu t2, 29(a0)
-; RV32I-NEXT:    lbu t0, 31(a0)
-; RV32I-NEXT:    lbu t1, 30(a0)
+; RV32I-NEXT:    mv t1, a1
+; RV32I-NEXT:    lbu t0, 30(a0)
 ; RV32I-NEXT:    lbu a1, 0(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 1(a0)
@@ -2422,71 +2420,73 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 5(a0)
 ; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu t5, 7(a0)
-; RV32I-NEXT:    lbu t6, 8(a0)
-; RV32I-NEXT:    lbu s0, 9(a0)
-; RV32I-NEXT:    lbu s1, 10(a0)
-; RV32I-NEXT:    lbu s2, 11(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    lbu s4, 13(a0)
-; RV32I-NEXT:    lbu s5, 14(a0)
-; RV32I-NEXT:    lbu s6, 15(a0)
-; RV32I-NEXT:    lbu s7, 16(a0)
-; RV32I-NEXT:    lbu s8, 17(a0)
-; RV32I-NEXT:    lbu s9, 18(a0)
-; RV32I-NEXT:    lbu s10, 19(a0)
-; RV32I-NEXT:    lbu s11, 20(a0)
-; RV32I-NEXT:    lbu ra, 21(a0)
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu a6, 23(a0)
-; RV32I-NEXT:    lbu a5, 24(a0)
-; RV32I-NEXT:    lbu a4, 25(a0)
-; RV32I-NEXT:    lbu a3, 26(a0)
-; RV32I-NEXT:    lbu a1, 27(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t3, 0(t3)
-; RV32I-NEXT:    sw t3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sb t1, 58(sp)
-; RV32I-NEXT:    sb t2, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a1, 55(sp)
-; RV32I-NEXT:    sb a3, 54(sp)
-; RV32I-NEXT:    sb a4, 53(sp)
-; RV32I-NEXT:    sb t0, 59(sp)
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    sb a5, 52(sp)
-; RV32I-NEXT:    sb a6, 51(sp)
-; RV32I-NEXT:    sb a7, 50(sp)
-; RV32I-NEXT:    sb ra, 49(sp)
-; RV32I-NEXT:    sb s11, 48(sp)
-; RV32I-NEXT:    sb s10, 47(sp)
-; RV32I-NEXT:    sb s9, 46(sp)
-; RV32I-NEXT:    sb s8, 45(sp)
-; RV32I-NEXT:    sb s7, 44(sp)
-; RV32I-NEXT:    sb s6, 43(sp)
-; RV32I-NEXT:    sb s5, 42(sp)
-; RV32I-NEXT:    sb s4, 41(sp)
-; RV32I-NEXT:    sb s3, 40(sp)
-; RV32I-NEXT:    sb s2, 39(sp)
-; RV32I-NEXT:    sb s1, 38(sp)
-; RV32I-NEXT:    sb s0, 37(sp)
-; RV32I-NEXT:    sb t6, 36(sp)
-; RV32I-NEXT:    sb t5, 35(sp)
-; RV32I-NEXT:    sb t4, 34(sp)
-; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t0, 31
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu s4, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s6, 17(a0)
+; RV32I-NEXT:    lbu s7, 18(a0)
+; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 22(a0)
+; RV32I-NEXT:    lbu ra, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a5, 26(a0)
+; RV32I-NEXT:    lbu a4, 27(a0)
+; RV32I-NEXT:    lbu a3, 28(a0)
+; RV32I-NEXT:    lbu a1, 29(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu t1, 0(t1)
+; RV32I-NEXT:    sw t1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb a1, 57(sp)
+; RV32I-NEXT:    sb a3, 56(sp)
+; RV32I-NEXT:    sb a4, 55(sp)
+; RV32I-NEXT:    sb a5, 54(sp)
+; RV32I-NEXT:    sb a6, 53(sp)
+; RV32I-NEXT:    sb a0, 59(sp)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb ra, 51(sp)
+; RV32I-NEXT:    sb s11, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
+; RV32I-NEXT:    sb s9, 48(sp)
+; RV32I-NEXT:    sb s8, 47(sp)
+; RV32I-NEXT:    sb s7, 46(sp)
+; RV32I-NEXT:    sb s6, 45(sp)
+; RV32I-NEXT:    sb s5, 44(sp)
+; RV32I-NEXT:    sb s4, 43(sp)
+; RV32I-NEXT:    sb s3, 42(sp)
+; RV32I-NEXT:    sb s2, 41(sp)
+; RV32I-NEXT:    sb s1, 40(sp)
+; RV32I-NEXT:    sb s0, 39(sp)
+; RV32I-NEXT:    sb t6, 38(sp)
+; RV32I-NEXT:    sb t5, 37(sp)
+; RV32I-NEXT:    sb t4, 36(sp)
+; RV32I-NEXT:    sb t3, 35(sp)
+; RV32I-NEXT:    sb t2, 34(sp)
+; RV32I-NEXT:    lw a1, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 33(sp)
+; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 30(sp)
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -2525,81 +2525,81 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    andi a0, a0, 31
 ; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a1, 23(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t0, 24(a0)
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t2, 25(a0)
-; RV32I-NEXT:    lbu s8, 22(a0)
-; RV32I-NEXT:    lbu t4, 21(a0)
-; RV32I-NEXT:    lbu t5, 20(a0)
-; RV32I-NEXT:    lbu t6, 19(a0)
-; RV32I-NEXT:    lbu a1, 15(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s1, 16(a0)
-; RV32I-NEXT:    lbu s2, 18(a0)
-; RV32I-NEXT:    lbu s3, 17(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu s6, 12(a0)
-; RV32I-NEXT:    lbu s7, 11(a0)
-; RV32I-NEXT:    lbu a1, 7(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu s9, 8(a0)
-; RV32I-NEXT:    lbu s10, 10(a0)
-; RV32I-NEXT:    lbu s11, 9(a0)
-; RV32I-NEXT:    lbu ra, 6(a0)
-; RV32I-NEXT:    lbu a7, 5(a0)
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    sb t0, 24(a2)
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    sb t1, 26(a2)
-; RV32I-NEXT:    sb s0, 29(a2)
-; RV32I-NEXT:    sb t3, 28(a2)
-; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 31(a2)
-; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 30(a2)
-; RV32I-NEXT:    sb s3, 17(a2)
-; RV32I-NEXT:    sb s1, 16(a2)
-; RV32I-NEXT:    sb t6, 19(a2)
-; RV32I-NEXT:    sb s2, 18(a2)
-; RV32I-NEXT:    sb t4, 21(a2)
-; RV32I-NEXT:    sb t5, 20(a2)
-; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 23(a2)
-; RV32I-NEXT:    sb s8, 22(a2)
-; RV32I-NEXT:    sb s11, 9(a2)
-; RV32I-NEXT:    sb s9, 8(a2)
-; RV32I-NEXT:    sb s7, 11(a2)
-; RV32I-NEXT:    sb s10, 10(a2)
-; RV32I-NEXT:    sb s5, 13(a2)
-; RV32I-NEXT:    sb s6, 12(a2)
-; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a6, 15(a2)
-; RV32I-NEXT:    sb s4, 14(a2)
-; RV32I-NEXT:    sb a0, 3(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    sb a7, 5(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    add t1, a1, a0
+; RV32I-NEXT:    lbu a0, 0(t1)
+; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 1(t1)
+; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 2(t1)
+; RV32I-NEXT:    lbu t0, 3(t1)
+; RV32I-NEXT:    lbu a0, 4(t1)
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 5(t1)
+; RV32I-NEXT:    lbu a0, 6(t1)
+; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a0, 7(t1)
+; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t2, 8(t1)
+; RV32I-NEXT:    lbu t3, 9(t1)
+; RV32I-NEXT:    lbu t4, 10(t1)
+; RV32I-NEXT:    lbu t5, 11(t1)
+; RV32I-NEXT:    lbu t6, 12(t1)
+; RV32I-NEXT:    lbu s0, 13(t1)
+; RV32I-NEXT:    lbu s1, 14(t1)
+; RV32I-NEXT:    lbu s2, 15(t1)
+; RV32I-NEXT:    lbu s3, 16(t1)
+; RV32I-NEXT:    lbu s4, 17(t1)
+; RV32I-NEXT:    lbu s5, 18(t1)
+; RV32I-NEXT:    lbu s6, 19(t1)
+; RV32I-NEXT:    lbu s7, 20(t1)
+; RV32I-NEXT:    lbu s8, 21(t1)
+; RV32I-NEXT:    lbu s9, 22(t1)
+; RV32I-NEXT:    lbu s10, 23(t1)
+; RV32I-NEXT:    lbu s11, 24(t1)
+; RV32I-NEXT:    lbu ra, 25(t1)
+; RV32I-NEXT:    lbu a4, 26(t1)
+; RV32I-NEXT:    lbu a0, 27(t1)
+; RV32I-NEXT:    lbu a3, 28(t1)
+; RV32I-NEXT:    lbu a1, 29(t1)
+; RV32I-NEXT:    lbu a5, 30(t1)
+; RV32I-NEXT:    lbu t1, 31(t1)
+; RV32I-NEXT:    sb ra, 25(a2)
+; RV32I-NEXT:    sb s11, 24(a2)
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    sb a1, 29(a2)
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    sb t1, 31(a2)
+; RV32I-NEXT:    sb a5, 30(a2)
+; RV32I-NEXT:    sb s4, 17(a2)
+; RV32I-NEXT:    sb s3, 16(a2)
+; RV32I-NEXT:    sb s6, 19(a2)
+; RV32I-NEXT:    sb s5, 18(a2)
+; RV32I-NEXT:    sb s8, 21(a2)
+; RV32I-NEXT:    sb s7, 20(a2)
+; RV32I-NEXT:    sb s10, 23(a2)
+; RV32I-NEXT:    sb s9, 22(a2)
+; RV32I-NEXT:    sb t3, 9(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb t5, 11(a2)
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    sb s0, 13(a2)
+; RV32I-NEXT:    sb t6, 12(a2)
+; RV32I-NEXT:    sb s2, 15(a2)
+; RV32I-NEXT:    sb s1, 14(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 1(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a6, 5(a2)
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    sb ra, 6(a2)
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a0, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index c80c3e6834f67..3db42f1ba1a01 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -773,89 +773,89 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 4(a1)
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 9(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    lbu t3, 12(a1)
-; RV32I-NEXT:    lbu t4, 13(a1)
-; RV32I-NEXT:    lbu t5, 14(a1)
-; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl t0, a7, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    not t3, a0
+; RV32I-NEXT:    sll t2, t2, t3
+; RV32I-NEXT:    or t2, t0, t2
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu s0, 0(a1)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a0, a0, 7
-; RV32I-NEXT:    srl a4, a3, a0
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a6, a5, 1
-; RV32I-NEXT:    not a7, a0
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    xori a4, a0, 31
+; RV32I-NEXT:    sll a5, a7, a4
+; RV32I-NEXT:    or a5, a3, a5
+; RV32I-NEXT:    srl a6, t1, a0
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    srl a1, a1, a0
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    xori a7, a0, 31
-; RV32I-NEXT:    sll a3, a3, a7
-; RV32I-NEXT:    or a3, a1, a3
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t0, t4, t3
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t1, t6, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli t1, t0, 1
-; RV32I-NEXT:    sll a7, t1, a7
-; RV32I-NEXT:    or a7, a5, a7
-; RV32I-NEXT:    srl a0, t0, a0
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a4, a7, a4
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    srl a0, a1, a0
+; RV32I-NEXT:    sb a6, 8(a2)
 ; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
-; RV32I-NEXT:    srli t0, a5, 16
-; RV32I-NEXT:    sb t0, 10(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    sb a5, 15(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    srli a0, a3, 16
 ; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
 ; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a0, a7, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    srli a0, t2, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
@@ -1058,90 +1058,90 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 24
 ; RV32I-NEXT:    sub a3, a3, a1
-; RV32I-NEXT:    lbu a1, 13(a3)
-; RV32I-NEXT:    lbu a4, 14(a3)
-; RV32I-NEXT:    lbu a5, 15(a3)
-; RV32I-NEXT:    lbu a6, 4(a3)
-; RV32I-NEXT:    lbu a7, 5(a3)
-; RV32I-NEXT:    lbu t0, 6(a3)
-; RV32I-NEXT:    lbu t1, 7(a3)
-; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 9(a3)
-; RV32I-NEXT:    lbu t4, 10(a3)
-; RV32I-NEXT:    lbu t5, 12(a3)
+; RV32I-NEXT:    lbu a1, 0(a3)
+; RV32I-NEXT:    lbu a4, 1(a3)
+; RV32I-NEXT:    lbu a5, 2(a3)
+; RV32I-NEXT:    lbu a6, 3(a3)
+; RV32I-NEXT:    lbu a7, 4(a3)
+; RV32I-NEXT:    lbu t0, 5(a3)
+; RV32I-NEXT:    lbu t1, 6(a3)
+; RV32I-NEXT:    lbu t2, 7(a3)
+; RV32I-NEXT:    lbu t3, 8(a3)
+; RV32I-NEXT:    lbu t4, 9(a3)
+; RV32I-NEXT:    lbu t5, 10(a3)
 ; RV32I-NEXT:    lbu t6, 11(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    lbu s0, 0(a3)
-; RV32I-NEXT:    lbu s1, 1(a3)
-; RV32I-NEXT:    lbu s2, 2(a3)
-; RV32I-NEXT:    lbu a3, 3(a3)
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu s0, 12(a3)
+; RV32I-NEXT:    lbu s1, 13(a3)
+; RV32I-NEXT:    lbu s2, 14(a3)
+; RV32I-NEXT:    lbu a3, 15(a3)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    andi a0, a0, 7
-; RV32I-NEXT:    sll a7, a6, a0
+; RV32I-NEXT:    sll t0, a7, a0
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    srli a4, a1, 1
+; RV32I-NEXT:    xori a5, a0, 31
+; RV32I-NEXT:    srl a4, a4, a5
+; RV32I-NEXT:    or a4, t0, a4
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a3, a3, 24
 ; RV32I-NEXT:    or a3, a3, s2
 ; RV32I-NEXT:    or a3, a3, s0
-; RV32I-NEXT:    srli t0, a3, 1
-; RV32I-NEXT:    xori t1, a0, 31
-; RV32I-NEXT:    srl t0, t0, t1
-; RV32I-NEXT:    or t0, a7, t0
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, t5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    sll a1, a1, a0
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or a4, t3, t2
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    sll a3, a3, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a6, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t6, t4
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    srli a5, a4, 1
-; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a1, a5
-; RV32I-NEXT:    sll a4, a4, a0
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    srl a5, t1, a5
+; RV32I-NEXT:    or a5, a3, a5
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a7, a7, 1
 ; RV32I-NEXT:    not t1, a0
-; RV32I-NEXT:    srl a6, a6, t1
-; RV32I-NEXT:    or a6, a4, a6
-; RV32I-NEXT:    sll a0, a3, a0
+; RV32I-NEXT:    srl a7, a7, t1
+; RV32I-NEXT:    or a7, a6, a7
+; RV32I-NEXT:    sll a0, a1, a0
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 10(a2)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    sb a1, 2(a2)
 ; RV32I-NEXT:    srli a1, a0, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 1(a2)
-; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    srli a0, t0, 16
 ; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    srli a0, t0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    srli a0, t0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
 ; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb t0, 4(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
@@ -1281,156 +1281,156 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 1(a1)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s3, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
 ; RV32I-NEXT:    lbu a0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    slli s3, s4, 24
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a4, s5, 24
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a0, a0, s2
-; RV32I-NEXT:    sb s4, 23(sp)
-; RV32I-NEXT:    sb s5, 22(sp)
-; RV32I-NEXT:    sb s1, 21(sp)
-; RV32I-NEXT:    sb s0, 20(sp)
-; RV32I-NEXT:    sb t6, 19(sp)
-; RV32I-NEXT:    sb t5, 18(sp)
-; RV32I-NEXT:    sb t4, 17(sp)
-; RV32I-NEXT:    sb t3, 16(sp)
-; RV32I-NEXT:    sb t2, 15(sp)
-; RV32I-NEXT:    sb t1, 14(sp)
-; RV32I-NEXT:    sb t0, 13(sp)
-; RV32I-NEXT:    sb a7, 12(sp)
-; RV32I-NEXT:    sb a6, 11(sp)
-; RV32I-NEXT:    sb a5, 10(sp)
-; RV32I-NEXT:    sb a4, 9(sp)
-; RV32I-NEXT:    sb a3, 8(sp)
-; RV32I-NEXT:    srai a1, s3, 31
-; RV32I-NEXT:    sb a1, 36(sp)
-; RV32I-NEXT:    sb a1, 32(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a1, 24(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 39(sp)
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    sb a4, 38(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 37(sp)
-; RV32I-NEXT:    sb a3, 35(sp)
-; RV32I-NEXT:    sb a4, 34(sp)
-; RV32I-NEXT:    sb a1, 33(sp)
-; RV32I-NEXT:    sb a3, 31(sp)
-; RV32I-NEXT:    sb a4, 30(sp)
-; RV32I-NEXT:    sb a1, 29(sp)
-; RV32I-NEXT:    sb a3, 27(sp)
-; RV32I-NEXT:    sb a4, 26(sp)
-; RV32I-NEXT:    sb a1, 25(sp)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    sb s5, 23(sp)
+; RV32I-NEXT:    sb s4, 22(sp)
+; RV32I-NEXT:    sb s3, 21(sp)
+; RV32I-NEXT:    sb s2, 20(sp)
+; RV32I-NEXT:    sb s1, 19(sp)
+; RV32I-NEXT:    sb s0, 18(sp)
+; RV32I-NEXT:    sb t6, 17(sp)
+; RV32I-NEXT:    sb t5, 16(sp)
+; RV32I-NEXT:    sb t4, 15(sp)
+; RV32I-NEXT:    sb t3, 14(sp)
+; RV32I-NEXT:    sb t2, 13(sp)
+; RV32I-NEXT:    sb t1, 12(sp)
+; RV32I-NEXT:    sb t0, 11(sp)
+; RV32I-NEXT:    sb a7, 10(sp)
+; RV32I-NEXT:    sb a6, 9(sp)
+; RV32I-NEXT:    sb a5, 8(sp)
+; RV32I-NEXT:    srai a4, a4, 31
+; RV32I-NEXT:    sb a4, 36(sp)
+; RV32I-NEXT:    sb a4, 32(sp)
+; RV32I-NEXT:    sb a4, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 39(sp)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 38(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 37(sp)
+; RV32I-NEXT:    sb a1, 35(sp)
+; RV32I-NEXT:    sb a3, 34(sp)
+; RV32I-NEXT:    sb a4, 33(sp)
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    sb a3, 30(sp)
+; RV32I-NEXT:    sb a4, 29(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    sb a3, 26(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
 ; RV32I-NEXT:    slli a1, a0, 25
 ; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 4(a1)
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 9(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    lbu t3, 12(a1)
-; RV32I-NEXT:    lbu t4, 13(a1)
-; RV32I-NEXT:    lbu t5, 14(a1)
-; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a6, 3(a1)
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl t0, a7, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t2, t6, t5
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    not t3, a0
+; RV32I-NEXT:    sll t2, t2, t3
+; RV32I-NEXT:    or t2, t0, t2
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu s0, 0(a1)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a0, a0, 7
-; RV32I-NEXT:    srl a4, a3, a0
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a6, a5, 1
-; RV32I-NEXT:    not a7, a0
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    xori a4, a0, 31
+; RV32I-NEXT:    sll a5, a7, a4
+; RV32I-NEXT:    or a5, a3, a5
+; RV32I-NEXT:    srl a6, t1, a0
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    srl a1, a1, a0
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    xori a7, a0, 31
-; RV32I-NEXT:    sll a3, a3, a7
-; RV32I-NEXT:    or a3, a1, a3
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t0, t4, t3
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or t1, t6, t5
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli t1, t0, 1
-; RV32I-NEXT:    sll a7, t1, a7
-; RV32I-NEXT:    or a7, a5, a7
-; RV32I-NEXT:    sra a0, t0, a0
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    sll a4, a7, a4
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    sra a0, a1, a0
+; RV32I-NEXT:    sb a6, 8(a2)
 ; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
-; RV32I-NEXT:    srli t0, a5, 16
-; RV32I-NEXT:    sb t0, 10(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    sb a5, 15(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
+; RV32I-NEXT:    srli a1, a6, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a6, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 13(a2)
-; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    srli a0, a3, 16
 ; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
 ; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a0, a7, 24
-; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    srli a0, t2, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
@@ -1530,6 +1530,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 84(sp)
 ; RV64I-NEXT:    sb a1, 83(sp)
 ; RV64I-NEXT:    sb a3, 82(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1562,7 +1563,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 90(sp)
 ; RV64I-NEXT:    sb zero, 89(sp)
 ; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb a4, 81(sp)
 ; RV64I-NEXT:    sb a6, 80(sp)
 ; RV64I-NEXT:    sb a7, 79(sp)
 ; RV64I-NEXT:    sb t0, 78(sp)
@@ -1595,7 +1595,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
 ; RV64I-NEXT:    slli a0, a5, 56
-; RV64I-NEXT:    mv ra, a5
+; RV64I-NEXT:    mv s11, a5
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 56
 ; RV64I-NEXT:    add a0, a1, a0
@@ -1612,8 +1612,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t5, 18(a0)
 ; RV64I-NEXT:    lbu t6, 19(a0)
 ; RV64I-NEXT:    lbu s0, 20(a0)
-; RV64I-NEXT:    lbu s1, 21(a0)
-; RV64I-NEXT:    lbu a1, 22(a0)
+; RV64I-NEXT:    lbu a1, 21(a0)
+; RV64I-NEXT:    lbu s1, 22(a0)
 ; RV64I-NEXT:    lbu s2, 23(a0)
 ; RV64I-NEXT:    lbu s3, 24(a0)
 ; RV64I-NEXT:    lbu s4, 25(a0)
@@ -1624,7 +1624,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s9, 30(a0)
 ; RV64I-NEXT:    lbu s10, 31(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu s11, 7(a0)
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
@@ -1638,19 +1637,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a4, a4, a3
-; RV64I-NEXT:    andi a3, ra, 7
+; RV64I-NEXT:    andi a3, s11, 7
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    or a5, t4, t3
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
 ; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    slli a1, a1, 16
-; RV64I-NEXT:    slli s2, s2, 24
-; RV64I-NEXT:    or a1, s2, a1
+; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or a6, s2, s1
+; RV64I-NEXT:    or a1, a6, a1
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a5
 ; RV64I-NEXT:    slli a1, a5, 1
@@ -1662,7 +1661,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t1, 3(a0)
 ; RV64I-NEXT:    lbu t2, 4(a0)
 ; RV64I-NEXT:    lbu t3, 5(a0)
-; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -1671,9 +1671,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t3, t3, 8
 ; RV64I-NEXT:    or a7, t3, t2
-; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    or a0, s11, a0
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a6, a0, a6
@@ -1925,8 +1925,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 8(a0)
 ; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
 ; RV32I-NEXT:    lbu t1, 10(a0)
 ; RV32I-NEXT:    lbu t2, 11(a0)
 ; RV32I-NEXT:    lbu t3, 12(a0)
@@ -1937,7 +1936,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s1, 17(a0)
 ; RV32I-NEXT:    lbu s2, 18(a0)
 ; RV32I-NEXT:    lbu s3, 19(a0)
-; RV32I-NEXT:    lbu t0, 20(a0)
+; RV32I-NEXT:    lbu a6, 20(a0)
 ; RV32I-NEXT:    lbu s5, 21(a0)
 ; RV32I-NEXT:    lbu s6, 22(a0)
 ; RV32I-NEXT:    lbu s7, 23(a0)
@@ -1948,38 +1947,37 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 28(a0)
 ; RV32I-NEXT:    lbu a5, 29(a0)
 ; RV32I-NEXT:    lbu ra, 30(a0)
-; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a3, 31(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu a3, 3(a0)
 ; RV32I-NEXT:    or a4, a4, a1
 ; RV32I-NEXT:    slli a1, a7, 16
 ; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    or a4, a1, a4
-; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    lw a7, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t0, 0(a0)
 ; RV32I-NEXT:    lbu t1, 1(a0)
-; RV32I-NEXT:    lbu t2, 0(a0)
-; RV32I-NEXT:    lbu a0, 2(a0)
 ; RV32I-NEXT:    or a7, a7, a1
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or a1, t1, t2
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    or a1, a0, t0
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    or a0, t4, t3
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a3, t6, t5
-; RV32I-NEXT:    or t1, a3, a0
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or t1, t0, a0
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    slli s2, s2, 16
@@ -1987,14 +1985,14 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, s3, s2
 ; RV32I-NEXT:    or s0, a0, s0
 ; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or a0, s5, t0
-; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    andi t2, a3, 7
+; RV32I-NEXT:    or a0, s5, a6
+; RV32I-NEXT:    lw a6, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t2, a6, 7
 ; RV32I-NEXT:    slli s6, s6, 16
 ; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    or a3, s7, s6
+; RV32I-NEXT:    or a6, s7, s6
 ; RV32I-NEXT:    slli t0, a7, 1
-; RV32I-NEXT:    or t3, a3, a0
+; RV32I-NEXT:    or t3, a6, a0
 ; RV32I-NEXT:    not t4, t2
 ; RV32I-NEXT:    sll a0, t0, t4
 ; RV32I-NEXT:    slli s8, s8, 8
@@ -2003,8 +2001,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli s9, s9, 16
 ; RV32I-NEXT:    slli s10, s10, 24
 ; RV32I-NEXT:    or t6, s10, s9
-; RV32I-NEXT:    slli a3, s0, 1
-; RV32I-NEXT:    sll a3, a3, t4
+; RV32I-NEXT:    slli a6, s0, 1
+; RV32I-NEXT:    sll a6, a6, t4
 ; RV32I-NEXT:    or t6, t6, t0
 ; RV32I-NEXT:    slli t0, t6, 1
 ; RV32I-NEXT:    sll t4, t0, t4
@@ -2015,13 +2013,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t5, t1, 1
 ; RV32I-NEXT:    sll t5, t5, s1
 ; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a6, a6, ra
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, ra
 ; RV32I-NEXT:    slli s2, t3, 1
 ; RV32I-NEXT:    sll s2, s2, s1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a6, a5, 1
-; RV32I-NEXT:    sll a6, a6, s1
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    slli a5, a3, 1
+; RV32I-NEXT:    sll a5, a5, s1
 ; RV32I-NEXT:    srl a4, a4, t2
 ; RV32I-NEXT:    srl a1, a1, t2
 ; RV32I-NEXT:    srl t1, t1, t2
@@ -2029,23 +2027,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl t3, t3, t2
 ; RV32I-NEXT:    srl s0, s0, t2
 ; RV32I-NEXT:    srl t6, t6, t2
-; RV32I-NEXT:    srl a5, a5, t2
+; RV32I-NEXT:    srl a3, a3, t2
 ; RV32I-NEXT:    srli t2, t6, 16
 ; RV32I-NEXT:    sb t2, 26(a2)
-; RV32I-NEXT:    or a6, t6, a6
+; RV32I-NEXT:    or a5, t6, a5
 ; RV32I-NEXT:    sb t6, 24(a2)
 ; RV32I-NEXT:    srli t2, t6, 8
 ; RV32I-NEXT:    sb t2, 25(a2)
-; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    srli t2, a3, 24
 ; RV32I-NEXT:    sb t2, 31(a2)
-; RV32I-NEXT:    srli t2, a5, 16
+; RV32I-NEXT:    srli t2, a3, 16
 ; RV32I-NEXT:    sb t2, 30(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 29(a2)
-; RV32I-NEXT:    srli a5, s0, 16
-; RV32I-NEXT:    sb a5, 18(a2)
-; RV32I-NEXT:    or a5, s0, s2
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 29(a2)
+; RV32I-NEXT:    srli a3, s0, 16
+; RV32I-NEXT:    sb a3, 18(a2)
+; RV32I-NEXT:    or a3, s0, s2
 ; RV32I-NEXT:    sb s0, 16(a2)
 ; RV32I-NEXT:    srli s0, s0, 8
 ; RV32I-NEXT:    sb s0, 17(a2)
@@ -2063,7 +2061,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    srli a7, t1, 16
 ; RV32I-NEXT:    sb a7, 14(a2)
-; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    or a6, t1, a6
 ; RV32I-NEXT:    sb t1, 12(a2)
 ; RV32I-NEXT:    srli a7, t1, 8
 ; RV32I-NEXT:    sb a7, 13(a2)
@@ -2079,16 +2077,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 27(a2)
 ; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    sb a5, 27(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 19(a2)
 ; RV32I-NEXT:    srli a1, t2, 24
 ; RV32I-NEXT:    sb a1, 23(a2)
 ; RV32I-NEXT:    srli a1, t3, 24
 ; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a1, a7, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
@@ -2265,120 +2263,120 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 88
 ; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a3, 25(a0)
-; RV64I-NEXT:    lbu a4, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a6, 28(a0)
-; RV64I-NEXT:    lbu a1, 29(a0)
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu t0, 31(a0)
-; RV64I-NEXT:    lbu s2, 8(a0)
-; RV64I-NEXT:    lbu s3, 9(a0)
-; RV64I-NEXT:    lbu s4, 10(a0)
-; RV64I-NEXT:    lbu s5, 11(a0)
-; RV64I-NEXT:    lbu s6, 12(a0)
-; RV64I-NEXT:    lbu s7, 13(a0)
-; RV64I-NEXT:    lbu s8, 14(a0)
-; RV64I-NEXT:    lbu s9, 15(a0)
-; RV64I-NEXT:    lbu t1, 16(a0)
-; RV64I-NEXT:    lbu t2, 17(a0)
-; RV64I-NEXT:    lbu t3, 18(a0)
-; RV64I-NEXT:    lbu t5, 19(a0)
-; RV64I-NEXT:    lbu t4, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s0, 22(a0)
-; RV64I-NEXT:    lbu s1, 23(a0)
-; RV64I-NEXT:    slli s3, s3, 8
-; RV64I-NEXT:    lbu s10, 7(a0)
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    slli s5, s5, 24
-; RV64I-NEXT:    or s3, s5, s4
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    slli s7, s7, 8
-; RV64I-NEXT:    or s3, s7, s6
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    or s4, s9, s8
-; RV64I-NEXT:    or s3, s4, s3
-; RV64I-NEXT:    slli s3, s3, 32
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    lbu s3, 0(a0)
-; RV64I-NEXT:    lbu s4, 1(a0)
-; RV64I-NEXT:    lbu s5, 2(a0)
-; RV64I-NEXT:    lbu s6, 3(a0)
-; RV64I-NEXT:    lbu s7, 4(a0)
-; RV64I-NEXT:    lbu s8, 5(a0)
-; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu a1, 16(a0)
+; RV64I-NEXT:    lbu t3, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu t5, 20(a0)
+; RV64I-NEXT:    lbu s0, 21(a0)
+; RV64I-NEXT:    lbu s1, 22(a0)
+; RV64I-NEXT:    lbu s2, 23(a0)
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s4, 25(a0)
+; RV64I-NEXT:    lbu s5, 26(a0)
+; RV64I-NEXT:    lbu s6, 27(a0)
+; RV64I-NEXT:    lbu s7, 28(a0)
+; RV64I-NEXT:    lbu s8, 29(a0)
+; RV64I-NEXT:    lbu s9, 30(a0)
+; RV64I-NEXT:    lbu s10, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 1(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t2
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a4, a0, a4
 ; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    or a0, s4, s3
 ; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    or a5, s6, s5
+; RV64I-NEXT:    or a0, a5, a0
 ; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s4, s8, s7
+; RV64I-NEXT:    or a5, s8, s7
 ; RV64I-NEXT:    slli s9, s9, 16
 ; RV64I-NEXT:    slli s10, s10, 24
-; RV64I-NEXT:    or s5, s10, s9
-; RV64I-NEXT:    or s4, s5, s4
-; RV64I-NEXT:    lbu a0, 24(a0)
-; RV64I-NEXT:    slli s4, s4, 32
-; RV64I-NEXT:    or s3, s4, s3
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a3, t0, a7
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a3, a1, a0
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or a0, t2, t1
-; RV64I-NEXT:    slli t3, t3, 16
-; RV64I-NEXT:    slli t5, t5, 24
-; RV64I-NEXT:    or a1, t5, t3
-; RV64I-NEXT:    or a1, a1, a0
-; RV64I-NEXT:    andi a4, s11, 7
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    or a5, t6, t4
-; RV64I-NEXT:    srli a0, s3, 1
-; RV64I-NEXT:    slli s0, s0, 16
-; RV64I-NEXT:    slli s1, s1, 24
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    xori a6, a4, 63
-; RV64I-NEXT:    srl a0, a0, a6
-; RV64I-NEXT:    or a5, s0, a5
+; RV64I-NEXT:    or a6, s10, s9
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a1
-; RV64I-NEXT:    srli a1, a5, 1
-; RV64I-NEXT:    srl a6, a1, a6
-; RV64I-NEXT:    srli a1, s2, 1
-; RV64I-NEXT:    not a7, a4
-; RV64I-NEXT:    srl a7, a1, a7
-; RV64I-NEXT:    sll a1, s2, a4
-; RV64I-NEXT:    sll a3, a3, a4
-; RV64I-NEXT:    sll a5, a5, a4
-; RV64I-NEXT:    sll a4, s3, a4
-; RV64I-NEXT:    srli t0, a5, 56
-; RV64I-NEXT:    sb t0, 23(a2)
-; RV64I-NEXT:    srli t0, a5, 48
-; RV64I-NEXT:    sb t0, 22(a2)
-; RV64I-NEXT:    srli t0, a5, 40
-; RV64I-NEXT:    sb t0, 21(a2)
-; RV64I-NEXT:    srli t0, a5, 32
-; RV64I-NEXT:    sb t0, 20(a2)
-; RV64I-NEXT:    srli t0, a5, 24
-; RV64I-NEXT:    sb t0, 19(a2)
-; RV64I-NEXT:    srli t0, a5, 16
-; RV64I-NEXT:    sb t0, 18(a2)
-; RV64I-NEXT:    or a7, a5, a7
+; RV64I-NEXT:    or a5, a5, a0
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a0, t3, a1
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a1, t6, t4
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:    andi a6, s11, 7
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    or a7, s0, t5
+; RV64I-NEXT:    srli a0, a4, 1
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or t0, s2, s1
+; RV64I-NEXT:    xori t1, a6, 63
+; RV64I-NEXT:    srl a0, a0, t1
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a1
+; RV64I-NEXT:    srli a1, a7, 1
+; RV64I-NEXT:    srl t0, a1, t1
+; RV64I-NEXT:    srli a1, a3, 1
+; RV64I-NEXT:    not t1, a6
+; RV64I-NEXT:    srl t1, a1, t1
+; RV64I-NEXT:    sll a1, a3, a6
+; RV64I-NEXT:    sll a3, a5, a6
+; RV64I-NEXT:    sll a5, a7, a6
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    sb a6, 23(a2)
+; RV64I-NEXT:    srli a6, a5, 48
+; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, a5, 40
+; RV64I-NEXT:    sb a6, 21(a2)
+; RV64I-NEXT:    srli a6, a5, 32
+; RV64I-NEXT:    sb a6, 20(a2)
+; RV64I-NEXT:    srli a6, a5, 24
+; RV64I-NEXT:    sb a6, 19(a2)
+; RV64I-NEXT:    srli a6, a5, 16
+; RV64I-NEXT:    sb a6, 18(a2)
+; RV64I-NEXT:    or a6, a5, t1
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
 ; RV64I-NEXT:    srli a5, a3, 56
@@ -2393,7 +2391,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a5, 27(a2)
 ; RV64I-NEXT:    srli a5, a3, 16
 ; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    or a5, a3, a6
+; RV64I-NEXT:    or a5, a3, t0
 ; RV64I-NEXT:    srli a3, a3, 8
 ; RV64I-NEXT:    sb a3, 25(a2)
 ; RV64I-NEXT:    srli a3, a4, 56
@@ -2426,7 +2424,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a7, 16(a2)
+; RV64I-NEXT:    sb a6, 16(a2)
 ; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
@@ -2584,175 +2582,177 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 27
 ; RV32I-NEXT:    addi a1, sp, 60
 ; RV32I-NEXT:    sub a0, a1, a0
-; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    lbu a4, 5(a0)
 ; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
-; RV32I-NEXT:    lbu a1, 8(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    sw a6, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    sw a6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 10(a0)
-; RV32I-NEXT:    lbu t4, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
-; RV32I-NEXT:    lbu t3, 15(a0)
-; RV32I-NEXT:    lbu a7, 16(a0)
-; RV32I-NEXT:    lbu t6, 17(a0)
-; RV32I-NEXT:    lbu t5, 18(a0)
-; RV32I-NEXT:    lbu s1, 29(a0)
-; RV32I-NEXT:    lbu s0, 30(a0)
-; RV32I-NEXT:    lbu s2, 31(a0)
-; RV32I-NEXT:    lbu s7, 21(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu t4, 14(a0)
+; RV32I-NEXT:    lbu t5, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu t0, 17(a0)
+; RV32I-NEXT:    lbu s1, 18(a0)
+; RV32I-NEXT:    lbu s2, 19(a0)
+; RV32I-NEXT:    lbu s5, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
 ; RV32I-NEXT:    lbu s8, 22(a0)
 ; RV32I-NEXT:    lbu s9, 23(a0)
 ; RV32I-NEXT:    lbu s3, 24(a0)
-; RV32I-NEXT:    lbu s5, 25(a0)
+; RV32I-NEXT:    lbu s6, 25(a0)
 ; RV32I-NEXT:    lbu s4, 26(a0)
-; RV32I-NEXT:    lbu s6, 27(a0)
-; RV32I-NEXT:    lbu s10, 19(a0)
-; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu s7, 27(a0)
+; RV32I-NEXT:    lbu s11, 28(a0)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu a6, 30(a0)
+; RV32I-NEXT:    lbu a7, 31(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu ra, 3(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 1(a0)
-; RV32I-NEXT:    lbu a6, 0(a0)
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a3, a5, a6
-; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    lbu a5, 12(a0)
-; RV32I-NEXT:    or a1, ra, a1
-; RV32I-NEXT:    or a3, a1, a3
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or a1, t4, a5
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or a5, t3, t2
-; RV32I-NEXT:    or a6, a5, a1
-; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a5, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a5, a1, a4
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    or a4, a0, a1
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or a0, t6, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a1, t5, t4
+; RV32I-NEXT:    or t3, a1, a0
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a0, a0, 8
 ; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, t0, a1
-; RV32I-NEXT:    lbu a5, 20(a0)
+; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or t0, s11, t1
-; RV32I-NEXT:    or t0, t0, a1
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or a1, s7, a5
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a1, t2, t1
+; RV32I-NEXT:    or t1, a1, a0
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or a0, s10, s5
 ; RV32I-NEXT:    slli s8, s8, 16
 ; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    or a5, s9, s8
-; RV32I-NEXT:    or t1, a5, a1
-; RV32I-NEXT:    slli t6, t6, 8
-; RV32I-NEXT:    or a1, t6, a7
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    andi a7, a0, 7
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    or a0, s10, t5
-; RV32I-NEXT:    srli t2, a3, 1
-; RV32I-NEXT:    or t3, a0, a1
-; RV32I-NEXT:    xori t4, a7, 31
-; RV32I-NEXT:    srl a0, t2, t4
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a5, s1, a5
-; RV32I-NEXT:    srli a1, t0, 1
-; RV32I-NEXT:    srl a1, a1, t4
-; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    or a1, s9, s8
+; RV32I-NEXT:    or t2, a1, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a0, t0, s0
+; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t0, a1, 7
+; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t2, s2, s0
-; RV32I-NEXT:    srli t5, a4, 1
-; RV32I-NEXT:    or t2, t2, a5
-; RV32I-NEXT:    not t6, a7
-; RV32I-NEXT:    srl a5, t5, t6
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or t5, s5, s3
-; RV32I-NEXT:    srli s0, t3, 1
-; RV32I-NEXT:    srl s0, s0, t4
+; RV32I-NEXT:    or a1, s2, s1
+; RV32I-NEXT:    srli a3, a4, 1
+; RV32I-NEXT:    or t4, a1, a0
+; RV32I-NEXT:    xori t5, t0, 31
+; RV32I-NEXT:    srl a0, a3, t5
+; RV32I-NEXT:    slli ra, ra, 8
+; RV32I-NEXT:    or a3, ra, s11
+; RV32I-NEXT:    srli a1, t1, 1
+; RV32I-NEXT:    srl a1, a1, t5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    srli a7, a5, 1
+; RV32I-NEXT:    or t6, a6, a3
+; RV32I-NEXT:    not a3, t0
+; RV32I-NEXT:    srl a6, a7, a3
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or a7, s6, s3
+; RV32I-NEXT:    srli s0, t4, 1
+; RV32I-NEXT:    srl s0, s0, t5
 ; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    or s1, s6, s4
-; RV32I-NEXT:    srli s2, a6, 1
-; RV32I-NEXT:    srl s2, s2, t6
-; RV32I-NEXT:    or t5, s1, t5
-; RV32I-NEXT:    srli s1, t5, 1
-; RV32I-NEXT:    srl t4, s1, t4
-; RV32I-NEXT:    srli s1, t1, 1
-; RV32I-NEXT:    srl t6, s1, t6
-; RV32I-NEXT:    sll a4, a4, a7
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    sll t0, t0, a7
-; RV32I-NEXT:    sll t1, t1, a7
-; RV32I-NEXT:    sll t3, t3, a7
-; RV32I-NEXT:    sll t2, t2, a7
-; RV32I-NEXT:    sll t5, t5, a7
-; RV32I-NEXT:    sll a3, a3, a7
-; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 27(a2)
-; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 26(a2)
-; RV32I-NEXT:    or a7, t5, t6
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, t2, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, t2, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    or t4, t2, t4
-; RV32I-NEXT:    srli t2, t2, 8
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 19(a2)
-; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 18(a2)
-; RV32I-NEXT:    or t2, t3, s2
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or s1, s7, s4
+; RV32I-NEXT:    srli s2, t3, 1
+; RV32I-NEXT:    srl s2, s2, a3
+; RV32I-NEXT:    or a7, s1, a7
+; RV32I-NEXT:    srli s1, a7, 1
+; RV32I-NEXT:    srl t5, s1, t5
+; RV32I-NEXT:    srli s1, t2, 1
+; RV32I-NEXT:    srl s1, s1, a3
+; RV32I-NEXT:    sll a3, a5, t0
+; RV32I-NEXT:    sll a5, t3, t0
+; RV32I-NEXT:    sll t1, t1, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t4, t0
+; RV32I-NEXT:    sll t4, t6, t0
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    sll a4, a4, t0
+; RV32I-NEXT:    srli t0, a7, 24
+; RV32I-NEXT:    sb t0, 27(a2)
+; RV32I-NEXT:    srli t0, a7, 16
+; RV32I-NEXT:    sb t0, 26(a2)
+; RV32I-NEXT:    or t0, a7, s1
+; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    sb a7, 25(a2)
+; RV32I-NEXT:    srli a7, t4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
+; RV32I-NEXT:    or a7, t4, t5
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb t4, 29(a2)
+; RV32I-NEXT:    srli t4, t3, 24
+; RV32I-NEXT:    sb t4, 19(a2)
+; RV32I-NEXT:    srli t4, t3, 16
+; RV32I-NEXT:    sb t4, 18(a2)
+; RV32I-NEXT:    or t4, t3, s2
 ; RV32I-NEXT:    srli t3, t3, 8
 ; RV32I-NEXT:    sb t3, 17(a2)
-; RV32I-NEXT:    srli t3, t1, 24
+; RV32I-NEXT:    srli t3, t2, 24
 ; RV32I-NEXT:    sb t3, 23(a2)
-; RV32I-NEXT:    srli t3, t1, 16
+; RV32I-NEXT:    srli t3, t2, 16
 ; RV32I-NEXT:    sb t3, 22(a2)
-; RV32I-NEXT:    or t3, t1, s0
+; RV32I-NEXT:    or t3, t2, s0
+; RV32I-NEXT:    srli t2, t2, 8
+; RV32I-NEXT:    sb t2, 21(a2)
+; RV32I-NEXT:    srli t2, t1, 24
+; RV32I-NEXT:    sb t2, 11(a2)
+; RV32I-NEXT:    srli t2, t1, 16
+; RV32I-NEXT:    sb t2, 10(a2)
+; RV32I-NEXT:    or a6, t1, a6
 ; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 21(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 11(a2)
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 10(a2)
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    srli t0, a6, 24
-; RV32I-NEXT:    sb t0, 15(a2)
-; RV32I-NEXT:    srli t0, a6, 16
-; RV32I-NEXT:    sb t0, 14(a2)
-; RV32I-NEXT:    or a1, a6, a1
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 3(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 2(a2)
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sb t1, 9(a2)
+; RV32I-NEXT:    srli t1, a5, 24
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    srli t1, a5, 16
+; RV32I-NEXT:    sb t1, 14(a2)
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 13(a2)
+; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    sb a5, 3(a2)
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    sb a5, 2(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a7, 24(a2)
-; RV32I-NEXT:    sb t4, 28(a2)
-; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb a4, 1(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 6(a2)
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 5(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t4, 16(a2)
 ; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a6, 8(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
@@ -2793,107 +2793,107 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 28(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t6, 31(a0)
+; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    lbu t1, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    lbu t3, 4(a1)
-; RV64I-NEXT:    lbu t4, 5(a1)
-; RV64I-NEXT:    lbu t5, 6(a1)
+; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t3, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    slli t4, t4, 8
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t5
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
 ; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 3(a0)
-; RV64I-NEXT:    lbu t3, 4(a0)
-; RV64I-NEXT:    lbu t4, 5(a0)
-; RV64I-NEXT:    lbu t5, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    lbu s0, 8(a0)
-; RV64I-NEXT:    lbu s1, 9(a0)
-; RV64I-NEXT:    lbu s2, 10(a0)
-; RV64I-NEXT:    lbu s3, 11(a0)
-; RV64I-NEXT:    lbu s4, 12(a0)
-; RV64I-NEXT:    lbu s5, 13(a0)
-; RV64I-NEXT:    lbu s6, 14(a0)
-; RV64I-NEXT:    lbu s7, 15(a0)
-; RV64I-NEXT:    lbu s8, 16(a0)
-; RV64I-NEXT:    lbu s9, 17(a0)
-; RV64I-NEXT:    lbu s10, 18(a0)
-; RV64I-NEXT:    lbu s11, 19(a0)
-; RV64I-NEXT:    lbu ra, 20(a0)
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu a5, 23(a0)
-; RV64I-NEXT:    lbu a4, 24(a0)
-; RV64I-NEXT:    lbu a3, 25(a0)
-; RV64I-NEXT:    lbu a1, 26(a0)
-; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    ld t1, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb t1, 86(sp)
-; RV64I-NEXT:    ld t1, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb t1, 85(sp)
-; RV64I-NEXT:    ld t1, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb t1, 84(sp)
-; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a1, 82(sp)
-; RV64I-NEXT:    sb a3, 81(sp)
-; RV64I-NEXT:    sb t6, 87(sp)
-; RV64I-NEXT:    slli t6, t6, 56
-; RV64I-NEXT:    sb a4, 80(sp)
-; RV64I-NEXT:    sb a5, 79(sp)
-; RV64I-NEXT:    sb a6, 78(sp)
-; RV64I-NEXT:    sb a7, 77(sp)
-; RV64I-NEXT:    sb ra, 76(sp)
-; RV64I-NEXT:    sb s11, 75(sp)
-; RV64I-NEXT:    sb s10, 74(sp)
-; RV64I-NEXT:    sb s9, 73(sp)
-; RV64I-NEXT:    sb s8, 72(sp)
-; RV64I-NEXT:    sb s7, 71(sp)
-; RV64I-NEXT:    sb s6, 70(sp)
-; RV64I-NEXT:    sb s5, 69(sp)
-; RV64I-NEXT:    sb s4, 68(sp)
-; RV64I-NEXT:    sb s3, 67(sp)
-; RV64I-NEXT:    sb s2, 66(sp)
-; RV64I-NEXT:    sb s1, 65(sp)
-; RV64I-NEXT:    sb s0, 64(sp)
-; RV64I-NEXT:    sb t0, 63(sp)
-; RV64I-NEXT:    sb t5, 62(sp)
-; RV64I-NEXT:    sb t4, 61(sp)
-; RV64I-NEXT:    sb t3, 60(sp)
-; RV64I-NEXT:    sb t2, 59(sp)
-; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t6, 63
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu t4, 7(a0)
+; RV64I-NEXT:    lbu t5, 8(a0)
+; RV64I-NEXT:    lbu t6, 9(a0)
+; RV64I-NEXT:    lbu s0, 10(a0)
+; RV64I-NEXT:    lbu s1, 11(a0)
+; RV64I-NEXT:    lbu s2, 12(a0)
+; RV64I-NEXT:    lbu s3, 13(a0)
+; RV64I-NEXT:    lbu s4, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s6, 16(a0)
+; RV64I-NEXT:    lbu s7, 17(a0)
+; RV64I-NEXT:    lbu s8, 18(a0)
+; RV64I-NEXT:    lbu s9, 19(a0)
+; RV64I-NEXT:    lbu s10, 20(a0)
+; RV64I-NEXT:    lbu s11, 21(a0)
+; RV64I-NEXT:    lbu ra, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    lbu a5, 25(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    ld t0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t0, 86(sp)
+; RV64I-NEXT:    ld t0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t0, 85(sp)
+; RV64I-NEXT:    sb a1, 84(sp)
+; RV64I-NEXT:    sb a3, 83(sp)
+; RV64I-NEXT:    sb a4, 82(sp)
+; RV64I-NEXT:    sb a5, 81(sp)
+; RV64I-NEXT:    sb a6, 80(sp)
+; RV64I-NEXT:    sb a0, 87(sp)
+; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    sb a7, 79(sp)
+; RV64I-NEXT:    sb ra, 78(sp)
+; RV64I-NEXT:    sb s11, 77(sp)
+; RV64I-NEXT:    sb s10, 76(sp)
+; RV64I-NEXT:    sb s9, 75(sp)
+; RV64I-NEXT:    sb s8, 74(sp)
+; RV64I-NEXT:    sb s7, 73(sp)
+; RV64I-NEXT:    sb s6, 72(sp)
+; RV64I-NEXT:    sb s5, 71(sp)
+; RV64I-NEXT:    sb s4, 70(sp)
+; RV64I-NEXT:    sb s3, 69(sp)
+; RV64I-NEXT:    sb s2, 68(sp)
+; RV64I-NEXT:    sb s1, 67(sp)
+; RV64I-NEXT:    sb s0, 66(sp)
+; RV64I-NEXT:    sb t6, 65(sp)
+; RV64I-NEXT:    sb t5, 64(sp)
+; RV64I-NEXT:    sb t4, 63(sp)
+; RV64I-NEXT:    sb t3, 62(sp)
+; RV64I-NEXT:    sb t2, 61(sp)
+; RV64I-NEXT:    sb t1, 60(sp)
+; RV64I-NEXT:    ld a1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 59(sp)
+; RV64I-NEXT:    ld a1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 58(sp)
+; RV64I-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 57(sp)
+; RV64I-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 56(sp)
+; RV64I-NEXT:    srai a0, a0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2933,8 +2933,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a7, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    slli a0, a7, 56
+; RV64I-NEXT:    ld ra, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    slli a0, ra, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 56
 ; RV64I-NEXT:    add a0, a1, a0
@@ -2942,8 +2942,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    lbu a1, 12(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a7, 12(a0)
 ; RV64I-NEXT:    lbu t0, 13(a0)
 ; RV64I-NEXT:    lbu t1, 14(a0)
 ; RV64I-NEXT:    lbu t2, 15(a0)
@@ -2964,22 +2963,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s10, 30(a0)
 ; RV64I-NEXT:    lbu s11, 31(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu ra, 7(a0)
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a1, a6, a5
 ; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    ld a3, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    or a3, t0, a3
+; RV64I-NEXT:    or a3, t0, a7
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or a4, t2, t1
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a3, a3, 32
 ; RV64I-NEXT:    or a4, a3, a1
-; RV64I-NEXT:    andi a3, a7, 7
+; RV64I-NEXT:    andi a3, ra, 7
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    or a1, t4, t3
 ; RV64I-NEXT:    slli t5, t5, 16
@@ -3003,7 +3000,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t1, 3(a0)
 ; RV64I-NEXT:    lbu t2, 4(a0)
 ; RV64I-NEXT:    lbu t3, 5(a0)
-; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -3012,9 +3010,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t3, t3, 8
 ; RV64I-NEXT:    or a7, t3, t2
-; RV64I-NEXT:    slli a0, a0, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    or a0, ra, a0
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a6, a0, a6
@@ -3136,19 +3134,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t2, 29(a0)
 ; RV32I-NEXT:    lbu a3, 0(a1)
 ; RV32I-NEXT:    lbu a4, 1(a1)
-; RV32I-NEXT:    lbu a7, 31(a0)
-; RV32I-NEXT:    lbu t0, 30(a0)
-; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu t0, 29(a0)
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a1, 0(a0)
@@ -3162,67 +3158,69 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 5(a0)
-; RV32I-NEXT:    lbu t4, 6(a0)
-; RV32I-NEXT:    lbu t5, 7(a0)
-; RV32I-NEXT:    lbu t6, 8(a0)
-; RV32I-NEXT:    lbu s0, 9(a0)
-; RV32I-NEXT:    lbu s1, 10(a0)
-; RV32I-NEXT:    lbu s2, 11(a0)
-; RV32I-NEXT:    lbu s3, 12(a0)
-; RV32I-NEXT:    lbu s4, 13(a0)
-; RV32I-NEXT:    lbu s5, 14(a0)
-; RV32I-NEXT:    lbu s6, 15(a0)
-; RV32I-NEXT:    lbu s7, 16(a0)
-; RV32I-NEXT:    lbu s8, 17(a0)
-; RV32I-NEXT:    lbu s9, 18(a0)
-; RV32I-NEXT:    lbu s10, 19(a0)
-; RV32I-NEXT:    lbu s11, 20(a0)
-; RV32I-NEXT:    lbu ra, 21(a0)
-; RV32I-NEXT:    lbu a6, 22(a0)
-; RV32I-NEXT:    lbu a5, 23(a0)
-; RV32I-NEXT:    lbu a4, 24(a0)
-; RV32I-NEXT:    lbu a3, 25(a0)
-; RV32I-NEXT:    lbu a1, 26(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb t0, 58(sp)
-; RV32I-NEXT:    sb t2, 57(sp)
-; RV32I-NEXT:    sb t3, 56(sp)
-; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a1, 54(sp)
-; RV32I-NEXT:    sb a3, 53(sp)
-; RV32I-NEXT:    sb a7, 59(sp)
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    sb a4, 52(sp)
-; RV32I-NEXT:    sb a5, 51(sp)
-; RV32I-NEXT:    sb a6, 50(sp)
-; RV32I-NEXT:    sb ra, 49(sp)
-; RV32I-NEXT:    sb s11, 48(sp)
-; RV32I-NEXT:    sb s10, 47(sp)
-; RV32I-NEXT:    sb s9, 46(sp)
-; RV32I-NEXT:    sb s8, 45(sp)
-; RV32I-NEXT:    sb s7, 44(sp)
-; RV32I-NEXT:    sb s6, 43(sp)
-; RV32I-NEXT:    sb s5, 42(sp)
-; RV32I-NEXT:    sb s4, 41(sp)
-; RV32I-NEXT:    sb s3, 40(sp)
-; RV32I-NEXT:    sb s2, 39(sp)
-; RV32I-NEXT:    sb s1, 38(sp)
-; RV32I-NEXT:    sb s0, 37(sp)
-; RV32I-NEXT:    sb t6, 36(sp)
-; RV32I-NEXT:    sb t5, 35(sp)
-; RV32I-NEXT:    sb t4, 34(sp)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu s4, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s6, 17(a0)
+; RV32I-NEXT:    lbu s7, 18(a0)
+; RV32I-NEXT:    lbu s8, 19(a0)
+; RV32I-NEXT:    lbu s9, 20(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 22(a0)
+; RV32I-NEXT:    lbu ra, 23(a0)
+; RV32I-NEXT:    lbu a6, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    sb a7, 58(sp)
+; RV32I-NEXT:    sb t0, 57(sp)
+; RV32I-NEXT:    sb a1, 56(sp)
+; RV32I-NEXT:    sb a3, 55(sp)
+; RV32I-NEXT:    sb a4, 54(sp)
+; RV32I-NEXT:    sb a5, 53(sp)
+; RV32I-NEXT:    sb a0, 59(sp)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    sb a6, 52(sp)
+; RV32I-NEXT:    sb ra, 51(sp)
+; RV32I-NEXT:    sb s11, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
+; RV32I-NEXT:    sb s9, 48(sp)
+; RV32I-NEXT:    sb s8, 47(sp)
+; RV32I-NEXT:    sb s7, 46(sp)
+; RV32I-NEXT:    sb s6, 45(sp)
+; RV32I-NEXT:    sb s5, 44(sp)
+; RV32I-NEXT:    sb s4, 43(sp)
+; RV32I-NEXT:    sb s3, 42(sp)
+; RV32I-NEXT:    sb s2, 41(sp)
+; RV32I-NEXT:    sb s1, 40(sp)
+; RV32I-NEXT:    sb s0, 39(sp)
+; RV32I-NEXT:    sb t6, 38(sp)
+; RV32I-NEXT:    sb t5, 37(sp)
+; RV32I-NEXT:    sb t4, 36(sp)
+; RV32I-NEXT:    sb t3, 35(sp)
+; RV32I-NEXT:    sb t2, 34(sp)
 ; RV32I-NEXT:    sb t1, 33(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, a7, 31
+; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 31(sp)
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 30(sp)
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -3265,13 +3263,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
 ; RV32I-NEXT:    lbu a3, 7(a0)
 ; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 8(a0)
 ; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 9(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
 ; RV32I-NEXT:    lbu t1, 10(a0)
 ; RV32I-NEXT:    lbu t2, 11(a0)
 ; RV32I-NEXT:    lbu t3, 12(a0)
@@ -3292,39 +3289,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 27(a0)
 ; RV32I-NEXT:    lbu ra, 28(a0)
 ; RV32I-NEXT:    lbu a5, 29(a0)
-; RV32I-NEXT:    lbu a7, 30(a0)
-; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a6, 30(a0)
+; RV32I-NEXT:    lbu a3, 31(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    lbu a3, 3(a0)
 ; RV32I-NEXT:    or a4, a4, a1
-; RV32I-NEXT:    slli a1, t0, 16
-; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    or a4, a1, a4
-; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    lw t0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    lbu t1, 1(a0)
-; RV32I-NEXT:    lbu t2, 0(a0)
-; RV32I-NEXT:    lbu a0, 2(a0)
-; RV32I-NEXT:    or t0, t0, a1
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or a1, t1, t2
-; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t1, 0(a0)
+; RV32I-NEXT:    lbu t2, 1(a0)
+; RV32I-NEXT:    or t0, a7, a1
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    or a1, a0, a7
 ; RV32I-NEXT:    slli t4, t4, 8
 ; RV32I-NEXT:    or a0, t4, t3
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a3, t6, t5
-; RV32I-NEXT:    or t2, a3, a0
+; RV32I-NEXT:    or a7, t6, t5
+; RV32I-NEXT:    or t2, a7, a0
 ; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    slli s2, s2, 16
@@ -3333,15 +3329,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or s0, a0, s0
 ; RV32I-NEXT:    slli s6, s6, 8
 ; RV32I-NEXT:    or a0, s6, s4
-; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    andi t3, a3, 7
+; RV32I-NEXT:    lw a7, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t3, a7, 7
 ; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli s8, s8, 24
-; RV32I-NEXT:    or a3, s8, s7
+; RV32I-NEXT:    or a7, s8, s7
 ; RV32I-NEXT:    slli t1, t0, 1
-; RV32I-NEXT:    or t4, a3, a0
-; RV32I-NEXT:    not a3, t3
-; RV32I-NEXT:    sll a0, t1, a3
+; RV32I-NEXT:    or t4, a7, a0
+; RV32I-NEXT:    not a7, t3
+; RV32I-NEXT:    sll a0, t1, a7
 ; RV32I-NEXT:    slli s9, s9, 8
 ; RV32I-NEXT:    or t5, s9, s5
 ; RV32I-NEXT:    slli t6, a4, 1
@@ -3349,24 +3345,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli s11, s11, 24
 ; RV32I-NEXT:    or s1, s11, s10
 ; RV32I-NEXT:    slli t1, s0, 1
-; RV32I-NEXT:    sll t1, t1, a3
+; RV32I-NEXT:    sll t1, t1, a7
 ; RV32I-NEXT:    or t5, s1, t5
 ; RV32I-NEXT:    slli s1, t5, 1
-; RV32I-NEXT:    sll s1, s1, a3
+; RV32I-NEXT:    sll s1, s1, a7
 ; RV32I-NEXT:    xori s2, t3, 31
-; RV32I-NEXT:    sll a3, t6, s2
+; RV32I-NEXT:    sll a7, t6, s2
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, ra
 ; RV32I-NEXT:    slli t6, t2, 1
 ; RV32I-NEXT:    sll t6, t6, s2
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    slli a7, t4, 1
-; RV32I-NEXT:    sll a7, a7, s2
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    slli a6, t4, 1
 ; RV32I-NEXT:    sll a6, a6, s2
+; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    slli a5, a3, 1
+; RV32I-NEXT:    sll a5, a5, s2
 ; RV32I-NEXT:    srl a4, a4, t3
 ; RV32I-NEXT:    srl a1, a1, t3
 ; RV32I-NEXT:    srl t2, t2, t3
@@ -3374,29 +3370,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srl t4, t4, t3
 ; RV32I-NEXT:    srl s0, s0, t3
 ; RV32I-NEXT:    srl t5, t5, t3
-; RV32I-NEXT:    sra a5, a5, t3
+; RV32I-NEXT:    sra a3, a3, t3
 ; RV32I-NEXT:    srli t3, t5, 16
 ; RV32I-NEXT:    sb t3, 26(a2)
-; RV32I-NEXT:    or a6, t5, a6
+; RV32I-NEXT:    or a5, t5, a5
 ; RV32I-NEXT:    sb t5, 24(a2)
 ; RV32I-NEXT:    srli t3, t5, 8
 ; RV32I-NEXT:    sb t3, 25(a2)
-; RV32I-NEXT:    srli t3, a5, 24
+; RV32I-NEXT:    srli t3, a3, 24
 ; RV32I-NEXT:    sb t3, 31(a2)
-; RV32I-NEXT:    srli t3, a5, 16
+; RV32I-NEXT:    srli t3, a3, 16
 ; RV32I-NEXT:    sb t3, 30(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 29(a2)
-; RV32I-NEXT:    srli a5, s0, 16
-; RV32I-NEXT:    sb a5, 18(a2)
-; RV32I-NEXT:    or a5, s0, a7
+; RV32I-NEXT:    sb a3, 28(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 29(a2)
+; RV32I-NEXT:    srli a3, s0, 16
+; RV32I-NEXT:    sb a3, 18(a2)
+; RV32I-NEXT:    or a3, s0, a6
 ; RV32I-NEXT:    sb s0, 16(a2)
 ; RV32I-NEXT:    srli s0, s0, 8
 ; RV32I-NEXT:    sb s0, 17(a2)
-; RV32I-NEXT:    srli a7, t4, 16
-; RV32I-NEXT:    sb a7, 22(a2)
-; RV32I-NEXT:    or a7, t4, s1
+; RV32I-NEXT:    srli a6, t4, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    or a6, t4, s1
 ; RV32I-NEXT:    sb t4, 20(a2)
 ; RV32I-NEXT:    srli t3, t4, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
@@ -3414,7 +3410,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t1, 13(a2)
 ; RV32I-NEXT:    srli t1, a1, 16
 ; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
 ; RV32I-NEXT:    sb a1, 1(a2)
@@ -3424,18 +3420,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 27(a2)
 ; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 19(a2)
-; RV32I-NEXT:    srli a1, a7, 24
+; RV32I-NEXT:    sb a5, 27(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 19(a2)
+; RV32I-NEXT:    srli a1, a6, 24
 ; RV32I-NEXT:    sb a1, 23(a2)
 ; RV32I-NEXT:    srli a1, t3, 24
 ; RV32I-NEXT:    sb a1, 11(a2)
 ; RV32I-NEXT:    srli a1, t0, 24
 ; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a7, 24
+; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload


