[llvm] [RISCV] Enable load clustering by default (PR #73789)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 29 04:27:55 PST 2023


https://github.com/asb created https://github.com/llvm/llvm-project/pull/73789

[RISCV] Enable load clustering by default

Also tweaks the heuristic so that operations are clustered if they are
within a cache line of each other (as AMDGPU does in
shouldScheduleLoadsNear). X86 does something similar, but uses
`((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally
set to 512 bytes or if the division is in error.

Posting for comment and so people can test on their workloads; feedback
on ideas for a tweaked heuristic etc. is welcome.

Stacks on top of #73778.
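
For reference, the tweaked RISC-V heuristic in the final patch of the
series boils down to the following (Offset1 and Offset2 are the byte
offsets now passed through to shouldClusterMemOps by the first patch):

  // A cache line is typically 64 bytes, so cluster if the memory ops
  // are on the same or a neighbouring cache line.
  return std::abs(Offset1 - Offset2) < 64;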


From ecc1ca6822882fb4a4287ae1eaad1896e1d52ff5 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 10:38:57 +0000
Subject: [PATCH 1/5] [MachineScheduler][NFCI] Add Offset and OffsetIsScalable
 args to shouldClusterMemOps

These are picked up from getMemOperandsWithOffsetWidth but weren't then
being passed through to shouldClusterMemOps, which forced backends to
collect the information again if they wanted to use the kind of
heuristics typically used for the similar shouldScheduleLoadsNear
function (e.g. checking that the offsets are within one cache line).

This patch just adds the parameters, but doesn't attempt to use them.
There is an opportunity to use them in the current PPC and AArch64
shouldClusterMemOps implementations, and I intend to use the offset in
the heuristic for RISC-V. I've left these for future patches in the
interest of being as incremental as possible.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h    |  2 ++
 llvm/lib/CodeGen/MachineScheduler.cpp          | 14 +++++++++-----
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp   |  5 +++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.h     |  2 ++
 llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp |  5 ++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp         |  2 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h           |  2 ++
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp       |  5 +++--
 llvm/lib/Target/PowerPC/PPCInstrInfo.h         |  2 ++
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp       |  5 +++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.h         |  2 ++
 11 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 665b7449ddb820a..b7dc56f93c739d9 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1501,7 +1501,9 @@ class TargetInstrInfo : public MCInstrInfo {
   /// \p NumBytes is the number of bytes that will be loaded from all the
   /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                   int64_t Offset1, bool OffsetIsScalable1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
+                                   int64_t Offset2, bool OffsetIsScalable2,
                                    unsigned ClusterSize,
                                    unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 4add33ba0996af0..cd5fe71ef0c1ad1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1698,11 +1698,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     unsigned Width;
+    bool OffsetIsScalable;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset, unsigned Width)
+              int64_t Offset, bool OffsetIsScalable, unsigned Width)
         : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
-          Width(Width) {}
+          OffsetIsScalable(OffsetIsScalable), Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1831,8 +1832,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
           SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
 
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes))
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpa.Offset,
+                                  MemOpa.OffsetIsScalable, MemOpb.BaseOps,
+                                  MemOpb.Offset, MemOpb.OffsetIsScalable,
+                                  ClusterLength, CurrentClusterBytes))
       continue;
 
     SUnit *SUa = MemOpa.SU;
@@ -1899,7 +1902,8 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
     unsigned Width;
     if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
                                            OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+      MemOpRecords.push_back(
+          MemOpInfo(&SU, BaseOps, Offset, OffsetIsScalable, Width));
 
       LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
                         << Offset << ", OffsetIsScalable: " << OffsetIsScalable
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e97f17e3f49c587..6b49e17528ada74 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4230,8 +4230,9 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
 ///
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index cc588cdad6b8e5a..65e5fb49536da24 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -179,7 +179,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
                            int64_t &MinOffset, int64_t &MaxOffset);
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 50f8ad4433c6d5c..442ae4dd7b34fe1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -232,7 +232,10 @@ class SIInsertHardClauses : public MachineFunctionPass {
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+              // We also lie about the Offset and OffsetIsScalable parameters,
+              // as they aren't used in the SIInstrInfo implementation.
+              !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
+                                        2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544f..0a06fa88b6b1025 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -541,7 +541,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                                      int64_t Offset1, bool OffsetIsScalable1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
+                                      int64_t Offset2, bool OffsetIsScalable2,
                                       unsigned ClusterSize,
                                       unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index eba817756e9c58e..6da4f74dfe5f3ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -234,7 +234,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       const TargetRegisterInfo *TRI) const final;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6784049348b1638..0de795d9d5e812e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2878,8 +2878,9 @@ static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
 }
 
 bool PPCInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
 
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 31e9859a41739a1..0c9ad607418ecc7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -530,7 +530,9 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   /// Returns true if the two given memory operations should be scheduled
   /// adjacent.
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2918e5654db4f9f..f596a4cd3752897 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2266,8 +2266,9 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 }
 
 bool RISCVInstrInfo::shouldClusterMemOps(
-    ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
+    ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset1,
+    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
+    int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize,
     unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 0954286a419bdd5..7e1d3f31180650d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -158,7 +158,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
       const TargetRegisterInfo *TRI) const override;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           int64_t Offset1, bool OffsetIsScalable1,
                            ArrayRef<const MachineOperand *> BaseOps2,
+                           int64_t Offset2, bool OffsetIsScalable2,
                            unsigned ClusterSize,
                            unsigned NumBytes) const override;
 

From 5a1d0a9e7a893ae90d7b71cc4b819ef75487d23f Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 11:09:20 +0000
Subject: [PATCH 2/5] Add missing doc comment change

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index b7dc56f93c739d9..afff4e5894e8541 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1496,6 +1496,9 @@ class TargetInstrInfo : public MCInstrInfo {
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
+  /// \p Offset1 and \p Offset2 are the byte offsets for the memory
+  /// operations, while \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate
+  /// if the offset is scaled gby a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store
   /// cluster if this hook returns true.
   /// \p NumBytes is the number of bytes that will be loaded from all the

From 2cada4e7e8754f556d7c8a9b2120a7f7747c981a Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 11:12:17 +0000
Subject: [PATCH 3/5] Note that Offset2 will never be less than Offset1

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index afff4e5894e8541..ba7f70d66a6c0d8 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1497,8 +1497,10 @@ class TargetInstrInfo : public MCInstrInfo {
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p Offset1 and \p Offset2 are the byte offsets for the memory
-  /// operations, while \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate
-  /// if the offset is scaled gby a runtime quantity.
+  /// operations, and \p Offset2 is guaranteed to be greater than or equal to
+  /// Offset1.
+  /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
+  /// scaled by a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store
   /// cluster if this hook returns true.
   /// \p NumBytes is the number of bytes that will be loaded from all the

From 3b552e80d6bd43736abba4de388271628b9c74af Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 12:12:39 +0000
Subject: [PATCH 4/5] Actually Offset2 may be less than Offset1

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index ba7f70d66a6c0d8..4ec46f9bde1adf2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1497,8 +1497,7 @@ class TargetInstrInfo : public MCInstrInfo {
   ///
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p Offset1 and \p Offset2 are the byte offsets for the memory
-  /// operations, and \p Offset2 is guaranteed to be greater than or equal to
-  /// Offset1.
+  /// operations.
   /// \p OffsetIsScalable1 and \p OffsetIsScalable2 indicate if the offset is
   /// scaled by a runtime quantity.
   /// \p ClusterSize is the number of operations in the resulting load/store

From 7a50880c97170274e5fbe750f09a54ec67b5a6cd Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 29 Nov 2023 12:24:27 +0000
Subject: [PATCH 5/5] [RISCV] Enable load clustering by default

Also tweaks the heuristic so that operations are clustered if they are
within a cache line of each other (as AMDGPU does in
shouldScheduleLoadsNear). X86 does something similar, but uses
`((Offset2 - Offset1) / 8 > 64)`. I'm not sure if that's intentionally
set to 512 bytes or if the division is in error.

Posting for comment and so people can test on their workloads; feedback
on ideas for a tweaked heuristic etc. is welcome.

Stacks on top of #73778.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |    6 +-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   16 +-
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   26 +-
 .../test/CodeGen/RISCV/callee-saved-fpr32s.ll |  720 ++--
 .../test/CodeGen/RISCV/callee-saved-fpr64s.ll |  496 +--
 llvm/test/CodeGen/RISCV/callee-saved-gprs.ll  | 1044 ++---
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |   88 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |   44 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |  104 +-
 llvm/test/CodeGen/RISCV/idiv_large.ll         |    6 +-
 .../test/CodeGen/RISCV/intrinsic-cttz-elts.ll |   16 +-
 llvm/test/CodeGen/RISCV/legalize-fneg.ll      |    8 +-
 llvm/test/CodeGen/RISCV/llvm.exp10.ll         |   26 +-
 llvm/test/CodeGen/RISCV/mul.ll                |   60 +-
 llvm/test/CodeGen/RISCV/nontemporal.ll        | 1420 +++---
 llvm/test/CodeGen/RISCV/push-pop-popret.ll    | 2004 ++++-----
 .../test/CodeGen/RISCV/reduction-formation.ll |   72 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |   44 +-
 llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll    |   44 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   34 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   22 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |   28 +-
 .../fixed-vectors-strided-load-store-asm.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  264 +-
 llvm/test/CodeGen/RISCV/rvv/pr63596.ll        |   12 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  500 +--
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  182 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  214 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |   98 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |   57 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  166 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 2138 +++++-----
 .../RISCV/wide-scalar-shift-legalization.ll   | 3794 +++++++++--------
 llvm/test/CodeGen/RISCV/xtheadmempair.ll      |   14 +-
 35 files changed, 6914 insertions(+), 6873 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index f596a4cd3752897..cf399f37d255c29 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2282,9 +2282,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  // A cache line is typically 64 bytes, so cluster if the memory ops are on
+  // the same or a neighbouring cache line.
+  return std::abs(Offset1 - Offset2) < 64;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b6c194f03f54209..0954fbb8314c6fa 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -95,11 +95,6 @@ static cl::opt<bool>
                         cl::desc("Enable Split RegisterAlloc for RVV"),
                         cl::init(false));
 
-static cl::opt<bool> EnableMISchedLoadClustering(
-    "riscv-misched-load-clustering", cl::Hidden,
-    cl::desc("Enable load clustering in the machine scheduler"),
-    cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -350,15 +345,10 @@ class RISCVPassConfig : public TargetPassConfig {
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
-    ScheduleDAGMILive *DAG = nullptr;
-    if (EnableMISchedLoadClustering) {
-      DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-    }
-    if (ST.hasMacroFusion()) {
-      DAG = DAG ? DAG : createGenericSchedLive(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    if (ST.hasMacroFusion())
       DAG->addMutation(createRISCVMacroFusionDAGMutation());
-    }
     return DAG;
   }
 
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index 274f1cef49aa955..3695a8a7f60862f 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -200,25 +200,25 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    lw a6, 4(a1)
+; RV32C-NEXT:    lw a6, 8(a1)
 ; RV32C-NEXT:    c.lw a3, 12(a1)
-; RV32C-NEXT:    c.lw a4, 0(a1)
-; RV32C-NEXT:    c.lw a1, 8(a1)
+; RV32C-NEXT:    c.lw a2, 4(a1)
+; RV32C-NEXT:    c.lw a1, 0(a1)
 ; RV32C-NEXT:    c.lui a5, 16
 ; RV32C-NEXT:    c.add a3, a5
 ; RV32C-NEXT:    c.slli a3, 3
-; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a3, a5
-; RV32C-NEXT:    srli a5, a4, 29
-; RV32C-NEXT:    slli a2, a6, 3
-; RV32C-NEXT:    c.or a2, a5
 ; RV32C-NEXT:    srli a5, a6, 29
+; RV32C-NEXT:    c.or a3, a5
+; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    slli a4, a2, 3
+; RV32C-NEXT:    c.or a4, a5
+; RV32C-NEXT:    c.srli a2, 29
+; RV32C-NEXT:    c.slli a6, 3
+; RV32C-NEXT:    or a2, a6, a2
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a5
-; RV32C-NEXT:    c.slli a4, 3
-; RV32C-NEXT:    c.sw a4, 0(a0)
-; RV32C-NEXT:    c.sw a1, 8(a0)
-; RV32C-NEXT:    c.sw a2, 4(a0)
+; RV32C-NEXT:    c.sw a1, 0(a0)
+; RV32C-NEXT:    c.sw a2, 8(a0)
+; RV32C-NEXT:    c.sw a4, 4(a0)
 ; RV32C-NEXT:    c.sw a3, 12(a0)
 ; RV32C-NEXT:    c.jr ra
 ;
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 7111316931f19b7..7aeaaab68a208cb 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -25,69 +25,69 @@ define void @callee() nounwind {
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
 ; ILP32-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    flw fa1, 16(a1)
-; ILP32-NEXT:    flw fa0, 20(a1)
-; ILP32-NEXT:    flw ft0, 24(a1)
-; ILP32-NEXT:    flw ft1, 28(a1)
-; ILP32-NEXT:    flw ft2, 32(a1)
-; ILP32-NEXT:    flw ft3, 36(a1)
-; ILP32-NEXT:    flw ft4, 40(a1)
-; ILP32-NEXT:    flw ft5, 44(a1)
-; ILP32-NEXT:    flw ft6, 48(a1)
-; ILP32-NEXT:    flw ft7, 52(a1)
-; ILP32-NEXT:    flw fa6, 56(a1)
-; ILP32-NEXT:    flw fa7, 60(a1)
-; ILP32-NEXT:    flw ft8, 64(a1)
-; ILP32-NEXT:    flw ft9, 68(a1)
-; ILP32-NEXT:    flw ft10, 72(a1)
-; ILP32-NEXT:    flw ft11, 76(a1)
-; ILP32-NEXT:    flw fs0, 80(a1)
-; ILP32-NEXT:    flw fs1, 84(a1)
-; ILP32-NEXT:    flw fs2, 88(a1)
-; ILP32-NEXT:    flw fs3, 92(a1)
-; ILP32-NEXT:    flw fs4, 96(a1)
-; ILP32-NEXT:    flw fs5, 100(a1)
-; ILP32-NEXT:    flw fs6, 104(a1)
-; ILP32-NEXT:    flw fs7, 108(a1)
+; ILP32-NEXT:    flw fa4, 16(a1)
+; ILP32-NEXT:    flw fa3, 20(a1)
+; ILP32-NEXT:    flw fa2, 24(a1)
+; ILP32-NEXT:    flw fa1, 28(a1)
+; ILP32-NEXT:    flw fa0, 32(a1)
+; ILP32-NEXT:    flw ft0, 36(a1)
+; ILP32-NEXT:    flw ft1, 40(a1)
+; ILP32-NEXT:    flw ft2, 44(a1)
+; ILP32-NEXT:    flw ft3, 48(a1)
+; ILP32-NEXT:    flw ft4, 52(a1)
+; ILP32-NEXT:    flw ft5, 56(a1)
+; ILP32-NEXT:    flw ft6, 60(a1)
+; ILP32-NEXT:    flw ft7, 64(a1)
+; ILP32-NEXT:    flw fa6, 68(a1)
+; ILP32-NEXT:    flw fa7, 72(a1)
+; ILP32-NEXT:    flw ft8, 76(a1)
+; ILP32-NEXT:    flw ft9, 80(a1)
+; ILP32-NEXT:    flw ft10, 84(a1)
+; ILP32-NEXT:    flw ft11, 88(a1)
+; ILP32-NEXT:    flw fs0, 92(a1)
+; ILP32-NEXT:    flw fs1, 96(a1)
+; ILP32-NEXT:    flw fs2, 100(a1)
+; ILP32-NEXT:    flw fs3, 104(a1)
+; ILP32-NEXT:    flw fs4, 108(a1)
+; ILP32-NEXT:    flw fs5, 112(a1)
+; ILP32-NEXT:    flw fs6, 116(a1)
+; ILP32-NEXT:    flw fs7, 120(a1)
 ; ILP32-NEXT:    flw fs8, 124(a1)
-; ILP32-NEXT:    flw fs9, 120(a1)
-; ILP32-NEXT:    flw fs10, 116(a1)
-; ILP32-NEXT:    flw fs11, 112(a1)
+; ILP32-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32-NEXT:    fsw fs8, 124(a1)
-; ILP32-NEXT:    fsw fs9, 120(a1)
-; ILP32-NEXT:    fsw fs10, 116(a1)
-; ILP32-NEXT:    fsw fs11, 112(a1)
-; ILP32-NEXT:    fsw fs7, 108(a1)
-; ILP32-NEXT:    fsw fs6, 104(a1)
-; ILP32-NEXT:    fsw fs5, 100(a1)
-; ILP32-NEXT:    fsw fs4, 96(a1)
-; ILP32-NEXT:    fsw fs3, 92(a1)
-; ILP32-NEXT:    fsw fs2, 88(a1)
-; ILP32-NEXT:    fsw fs1, 84(a1)
-; ILP32-NEXT:    fsw fs0, 80(a1)
-; ILP32-NEXT:    fsw ft11, 76(a1)
-; ILP32-NEXT:    fsw ft10, 72(a1)
-; ILP32-NEXT:    fsw ft9, 68(a1)
-; ILP32-NEXT:    fsw ft8, 64(a1)
-; ILP32-NEXT:    fsw fa7, 60(a1)
-; ILP32-NEXT:    fsw fa6, 56(a1)
-; ILP32-NEXT:    fsw ft7, 52(a1)
-; ILP32-NEXT:    fsw ft6, 48(a1)
-; ILP32-NEXT:    fsw ft5, 44(a1)
-; ILP32-NEXT:    fsw ft4, 40(a1)
-; ILP32-NEXT:    fsw ft3, 36(a1)
-; ILP32-NEXT:    fsw ft2, 32(a1)
-; ILP32-NEXT:    fsw ft1, 28(a1)
-; ILP32-NEXT:    fsw ft0, 24(a1)
-; ILP32-NEXT:    fsw fa0, 20(a1)
-; ILP32-NEXT:    fsw fa1, 16(a1)
-; ILP32-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32-NEXT:    fsw fs7, 120(a1)
+; ILP32-NEXT:    fsw fs6, 116(a1)
+; ILP32-NEXT:    fsw fs5, 112(a1)
+; ILP32-NEXT:    fsw fs4, 108(a1)
+; ILP32-NEXT:    fsw fs3, 104(a1)
+; ILP32-NEXT:    fsw fs2, 100(a1)
+; ILP32-NEXT:    fsw fs1, 96(a1)
+; ILP32-NEXT:    fsw fs0, 92(a1)
+; ILP32-NEXT:    fsw ft11, 88(a1)
+; ILP32-NEXT:    fsw ft10, 84(a1)
+; ILP32-NEXT:    fsw ft9, 80(a1)
+; ILP32-NEXT:    fsw ft8, 76(a1)
+; ILP32-NEXT:    fsw fa7, 72(a1)
+; ILP32-NEXT:    fsw fa6, 68(a1)
+; ILP32-NEXT:    fsw ft7, 64(a1)
+; ILP32-NEXT:    fsw ft6, 60(a1)
+; ILP32-NEXT:    fsw ft5, 56(a1)
+; ILP32-NEXT:    fsw ft4, 52(a1)
+; ILP32-NEXT:    fsw ft3, 48(a1)
+; ILP32-NEXT:    fsw ft2, 44(a1)
+; ILP32-NEXT:    fsw ft1, 40(a1)
+; ILP32-NEXT:    fsw ft0, 36(a1)
+; ILP32-NEXT:    fsw fa0, 32(a1)
+; ILP32-NEXT:    fsw fa1, 28(a1)
+; ILP32-NEXT:    fsw fa2, 24(a1)
+; ILP32-NEXT:    fsw fa3, 20(a1)
+; ILP32-NEXT:    fsw fa4, 16(a1)
+; ILP32-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
@@ -95,69 +95,69 @@ define void @callee() nounwind {
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
 ; LP64-NEXT:    flw fa5, %lo(var)(a0)
-; LP64-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    flw fa1, 16(a1)
-; LP64-NEXT:    flw fa0, 20(a1)
-; LP64-NEXT:    flw ft0, 24(a1)
-; LP64-NEXT:    flw ft1, 28(a1)
-; LP64-NEXT:    flw ft2, 32(a1)
-; LP64-NEXT:    flw ft3, 36(a1)
-; LP64-NEXT:    flw ft4, 40(a1)
-; LP64-NEXT:    flw ft5, 44(a1)
-; LP64-NEXT:    flw ft6, 48(a1)
-; LP64-NEXT:    flw ft7, 52(a1)
-; LP64-NEXT:    flw fa6, 56(a1)
-; LP64-NEXT:    flw fa7, 60(a1)
-; LP64-NEXT:    flw ft8, 64(a1)
-; LP64-NEXT:    flw ft9, 68(a1)
-; LP64-NEXT:    flw ft10, 72(a1)
-; LP64-NEXT:    flw ft11, 76(a1)
-; LP64-NEXT:    flw fs0, 80(a1)
-; LP64-NEXT:    flw fs1, 84(a1)
-; LP64-NEXT:    flw fs2, 88(a1)
-; LP64-NEXT:    flw fs3, 92(a1)
-; LP64-NEXT:    flw fs4, 96(a1)
-; LP64-NEXT:    flw fs5, 100(a1)
-; LP64-NEXT:    flw fs6, 104(a1)
-; LP64-NEXT:    flw fs7, 108(a1)
+; LP64-NEXT:    flw fa4, 16(a1)
+; LP64-NEXT:    flw fa3, 20(a1)
+; LP64-NEXT:    flw fa2, 24(a1)
+; LP64-NEXT:    flw fa1, 28(a1)
+; LP64-NEXT:    flw fa0, 32(a1)
+; LP64-NEXT:    flw ft0, 36(a1)
+; LP64-NEXT:    flw ft1, 40(a1)
+; LP64-NEXT:    flw ft2, 44(a1)
+; LP64-NEXT:    flw ft3, 48(a1)
+; LP64-NEXT:    flw ft4, 52(a1)
+; LP64-NEXT:    flw ft5, 56(a1)
+; LP64-NEXT:    flw ft6, 60(a1)
+; LP64-NEXT:    flw ft7, 64(a1)
+; LP64-NEXT:    flw fa6, 68(a1)
+; LP64-NEXT:    flw fa7, 72(a1)
+; LP64-NEXT:    flw ft8, 76(a1)
+; LP64-NEXT:    flw ft9, 80(a1)
+; LP64-NEXT:    flw ft10, 84(a1)
+; LP64-NEXT:    flw ft11, 88(a1)
+; LP64-NEXT:    flw fs0, 92(a1)
+; LP64-NEXT:    flw fs1, 96(a1)
+; LP64-NEXT:    flw fs2, 100(a1)
+; LP64-NEXT:    flw fs3, 104(a1)
+; LP64-NEXT:    flw fs4, 108(a1)
+; LP64-NEXT:    flw fs5, 112(a1)
+; LP64-NEXT:    flw fs6, 116(a1)
+; LP64-NEXT:    flw fs7, 120(a1)
 ; LP64-NEXT:    flw fs8, 124(a1)
-; LP64-NEXT:    flw fs9, 120(a1)
-; LP64-NEXT:    flw fs10, 116(a1)
-; LP64-NEXT:    flw fs11, 112(a1)
+; LP64-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64-NEXT:    fsw fs8, 124(a1)
-; LP64-NEXT:    fsw fs9, 120(a1)
-; LP64-NEXT:    fsw fs10, 116(a1)
-; LP64-NEXT:    fsw fs11, 112(a1)
-; LP64-NEXT:    fsw fs7, 108(a1)
-; LP64-NEXT:    fsw fs6, 104(a1)
-; LP64-NEXT:    fsw fs5, 100(a1)
-; LP64-NEXT:    fsw fs4, 96(a1)
-; LP64-NEXT:    fsw fs3, 92(a1)
-; LP64-NEXT:    fsw fs2, 88(a1)
-; LP64-NEXT:    fsw fs1, 84(a1)
-; LP64-NEXT:    fsw fs0, 80(a1)
-; LP64-NEXT:    fsw ft11, 76(a1)
-; LP64-NEXT:    fsw ft10, 72(a1)
-; LP64-NEXT:    fsw ft9, 68(a1)
-; LP64-NEXT:    fsw ft8, 64(a1)
-; LP64-NEXT:    fsw fa7, 60(a1)
-; LP64-NEXT:    fsw fa6, 56(a1)
-; LP64-NEXT:    fsw ft7, 52(a1)
-; LP64-NEXT:    fsw ft6, 48(a1)
-; LP64-NEXT:    fsw ft5, 44(a1)
-; LP64-NEXT:    fsw ft4, 40(a1)
-; LP64-NEXT:    fsw ft3, 36(a1)
-; LP64-NEXT:    fsw ft2, 32(a1)
-; LP64-NEXT:    fsw ft1, 28(a1)
-; LP64-NEXT:    fsw ft0, 24(a1)
-; LP64-NEXT:    fsw fa0, 20(a1)
-; LP64-NEXT:    fsw fa1, 16(a1)
-; LP64-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64-NEXT:    fsw fs7, 120(a1)
+; LP64-NEXT:    fsw fs6, 116(a1)
+; LP64-NEXT:    fsw fs5, 112(a1)
+; LP64-NEXT:    fsw fs4, 108(a1)
+; LP64-NEXT:    fsw fs3, 104(a1)
+; LP64-NEXT:    fsw fs2, 100(a1)
+; LP64-NEXT:    fsw fs1, 96(a1)
+; LP64-NEXT:    fsw fs0, 92(a1)
+; LP64-NEXT:    fsw ft11, 88(a1)
+; LP64-NEXT:    fsw ft10, 84(a1)
+; LP64-NEXT:    fsw ft9, 80(a1)
+; LP64-NEXT:    fsw ft8, 76(a1)
+; LP64-NEXT:    fsw fa7, 72(a1)
+; LP64-NEXT:    fsw fa6, 68(a1)
+; LP64-NEXT:    fsw ft7, 64(a1)
+; LP64-NEXT:    fsw ft6, 60(a1)
+; LP64-NEXT:    fsw ft5, 56(a1)
+; LP64-NEXT:    fsw ft4, 52(a1)
+; LP64-NEXT:    fsw ft3, 48(a1)
+; LP64-NEXT:    fsw ft2, 44(a1)
+; LP64-NEXT:    fsw ft1, 40(a1)
+; LP64-NEXT:    fsw ft0, 36(a1)
+; LP64-NEXT:    fsw fa0, 32(a1)
+; LP64-NEXT:    fsw fa1, 28(a1)
+; LP64-NEXT:    fsw fa2, 24(a1)
+; LP64-NEXT:    fsw fa3, 20(a1)
+; LP64-NEXT:    fsw fa4, 16(a1)
+; LP64-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
@@ -178,69 +178,69 @@ define void @callee() nounwind {
 ; ILP32F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; ILP32F-NEXT:    lui a0, %hi(var)
 ; ILP32F-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32F-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32F-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32F-NEXT:    addi a1, a0, %lo(var)
-; ILP32F-NEXT:    flw fa1, 16(a1)
-; ILP32F-NEXT:    flw fa0, 20(a1)
-; ILP32F-NEXT:    flw ft0, 24(a1)
-; ILP32F-NEXT:    flw ft1, 28(a1)
-; ILP32F-NEXT:    flw ft2, 32(a1)
-; ILP32F-NEXT:    flw ft3, 36(a1)
-; ILP32F-NEXT:    flw ft4, 40(a1)
-; ILP32F-NEXT:    flw ft5, 44(a1)
-; ILP32F-NEXT:    flw ft6, 48(a1)
-; ILP32F-NEXT:    flw ft7, 52(a1)
-; ILP32F-NEXT:    flw fa6, 56(a1)
-; ILP32F-NEXT:    flw fa7, 60(a1)
-; ILP32F-NEXT:    flw ft8, 64(a1)
-; ILP32F-NEXT:    flw ft9, 68(a1)
-; ILP32F-NEXT:    flw ft10, 72(a1)
-; ILP32F-NEXT:    flw ft11, 76(a1)
-; ILP32F-NEXT:    flw fs0, 80(a1)
-; ILP32F-NEXT:    flw fs1, 84(a1)
-; ILP32F-NEXT:    flw fs2, 88(a1)
-; ILP32F-NEXT:    flw fs3, 92(a1)
-; ILP32F-NEXT:    flw fs4, 96(a1)
-; ILP32F-NEXT:    flw fs5, 100(a1)
-; ILP32F-NEXT:    flw fs6, 104(a1)
-; ILP32F-NEXT:    flw fs7, 108(a1)
+; ILP32F-NEXT:    flw fa4, 16(a1)
+; ILP32F-NEXT:    flw fa3, 20(a1)
+; ILP32F-NEXT:    flw fa2, 24(a1)
+; ILP32F-NEXT:    flw fa1, 28(a1)
+; ILP32F-NEXT:    flw fa0, 32(a1)
+; ILP32F-NEXT:    flw ft0, 36(a1)
+; ILP32F-NEXT:    flw ft1, 40(a1)
+; ILP32F-NEXT:    flw ft2, 44(a1)
+; ILP32F-NEXT:    flw ft3, 48(a1)
+; ILP32F-NEXT:    flw ft4, 52(a1)
+; ILP32F-NEXT:    flw ft5, 56(a1)
+; ILP32F-NEXT:    flw ft6, 60(a1)
+; ILP32F-NEXT:    flw ft7, 64(a1)
+; ILP32F-NEXT:    flw fa6, 68(a1)
+; ILP32F-NEXT:    flw fa7, 72(a1)
+; ILP32F-NEXT:    flw ft8, 76(a1)
+; ILP32F-NEXT:    flw ft9, 80(a1)
+; ILP32F-NEXT:    flw ft10, 84(a1)
+; ILP32F-NEXT:    flw ft11, 88(a1)
+; ILP32F-NEXT:    flw fs0, 92(a1)
+; ILP32F-NEXT:    flw fs1, 96(a1)
+; ILP32F-NEXT:    flw fs2, 100(a1)
+; ILP32F-NEXT:    flw fs3, 104(a1)
+; ILP32F-NEXT:    flw fs4, 108(a1)
+; ILP32F-NEXT:    flw fs5, 112(a1)
+; ILP32F-NEXT:    flw fs6, 116(a1)
+; ILP32F-NEXT:    flw fs7, 120(a1)
 ; ILP32F-NEXT:    flw fs8, 124(a1)
-; ILP32F-NEXT:    flw fs9, 120(a1)
-; ILP32F-NEXT:    flw fs10, 116(a1)
-; ILP32F-NEXT:    flw fs11, 112(a1)
+; ILP32F-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32F-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32F-NEXT:    fsw fs8, 124(a1)
-; ILP32F-NEXT:    fsw fs9, 120(a1)
-; ILP32F-NEXT:    fsw fs10, 116(a1)
-; ILP32F-NEXT:    fsw fs11, 112(a1)
-; ILP32F-NEXT:    fsw fs7, 108(a1)
-; ILP32F-NEXT:    fsw fs6, 104(a1)
-; ILP32F-NEXT:    fsw fs5, 100(a1)
-; ILP32F-NEXT:    fsw fs4, 96(a1)
-; ILP32F-NEXT:    fsw fs3, 92(a1)
-; ILP32F-NEXT:    fsw fs2, 88(a1)
-; ILP32F-NEXT:    fsw fs1, 84(a1)
-; ILP32F-NEXT:    fsw fs0, 80(a1)
-; ILP32F-NEXT:    fsw ft11, 76(a1)
-; ILP32F-NEXT:    fsw ft10, 72(a1)
-; ILP32F-NEXT:    fsw ft9, 68(a1)
-; ILP32F-NEXT:    fsw ft8, 64(a1)
-; ILP32F-NEXT:    fsw fa7, 60(a1)
-; ILP32F-NEXT:    fsw fa6, 56(a1)
-; ILP32F-NEXT:    fsw ft7, 52(a1)
-; ILP32F-NEXT:    fsw ft6, 48(a1)
-; ILP32F-NEXT:    fsw ft5, 44(a1)
-; ILP32F-NEXT:    fsw ft4, 40(a1)
-; ILP32F-NEXT:    fsw ft3, 36(a1)
-; ILP32F-NEXT:    fsw ft2, 32(a1)
-; ILP32F-NEXT:    fsw ft1, 28(a1)
-; ILP32F-NEXT:    fsw ft0, 24(a1)
-; ILP32F-NEXT:    fsw fa0, 20(a1)
-; ILP32F-NEXT:    fsw fa1, 16(a1)
-; ILP32F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32F-NEXT:    fsw fs7, 120(a1)
+; ILP32F-NEXT:    fsw fs6, 116(a1)
+; ILP32F-NEXT:    fsw fs5, 112(a1)
+; ILP32F-NEXT:    fsw fs4, 108(a1)
+; ILP32F-NEXT:    fsw fs3, 104(a1)
+; ILP32F-NEXT:    fsw fs2, 100(a1)
+; ILP32F-NEXT:    fsw fs1, 96(a1)
+; ILP32F-NEXT:    fsw fs0, 92(a1)
+; ILP32F-NEXT:    fsw ft11, 88(a1)
+; ILP32F-NEXT:    fsw ft10, 84(a1)
+; ILP32F-NEXT:    fsw ft9, 80(a1)
+; ILP32F-NEXT:    fsw ft8, 76(a1)
+; ILP32F-NEXT:    fsw fa7, 72(a1)
+; ILP32F-NEXT:    fsw fa6, 68(a1)
+; ILP32F-NEXT:    fsw ft7, 64(a1)
+; ILP32F-NEXT:    fsw ft6, 60(a1)
+; ILP32F-NEXT:    fsw ft5, 56(a1)
+; ILP32F-NEXT:    fsw ft4, 52(a1)
+; ILP32F-NEXT:    fsw ft3, 48(a1)
+; ILP32F-NEXT:    fsw ft2, 44(a1)
+; ILP32F-NEXT:    fsw ft1, 40(a1)
+; ILP32F-NEXT:    fsw ft0, 36(a1)
+; ILP32F-NEXT:    fsw fa0, 32(a1)
+; ILP32F-NEXT:    fsw fa1, 28(a1)
+; ILP32F-NEXT:    fsw fa2, 24(a1)
+; ILP32F-NEXT:    fsw fa3, 20(a1)
+; ILP32F-NEXT:    fsw fa4, 16(a1)
+; ILP32F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32F-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; ILP32F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -274,69 +274,69 @@ define void @callee() nounwind {
 ; LP64F-NEXT:    fsw fs11, 0(sp) # 4-byte Folded Spill
 ; LP64F-NEXT:    lui a0, %hi(var)
 ; LP64F-NEXT:    flw fa5, %lo(var)(a0)
-; LP64F-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64F-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64F-NEXT:    addi a1, a0, %lo(var)
-; LP64F-NEXT:    flw fa1, 16(a1)
-; LP64F-NEXT:    flw fa0, 20(a1)
-; LP64F-NEXT:    flw ft0, 24(a1)
-; LP64F-NEXT:    flw ft1, 28(a1)
-; LP64F-NEXT:    flw ft2, 32(a1)
-; LP64F-NEXT:    flw ft3, 36(a1)
-; LP64F-NEXT:    flw ft4, 40(a1)
-; LP64F-NEXT:    flw ft5, 44(a1)
-; LP64F-NEXT:    flw ft6, 48(a1)
-; LP64F-NEXT:    flw ft7, 52(a1)
-; LP64F-NEXT:    flw fa6, 56(a1)
-; LP64F-NEXT:    flw fa7, 60(a1)
-; LP64F-NEXT:    flw ft8, 64(a1)
-; LP64F-NEXT:    flw ft9, 68(a1)
-; LP64F-NEXT:    flw ft10, 72(a1)
-; LP64F-NEXT:    flw ft11, 76(a1)
-; LP64F-NEXT:    flw fs0, 80(a1)
-; LP64F-NEXT:    flw fs1, 84(a1)
-; LP64F-NEXT:    flw fs2, 88(a1)
-; LP64F-NEXT:    flw fs3, 92(a1)
-; LP64F-NEXT:    flw fs4, 96(a1)
-; LP64F-NEXT:    flw fs5, 100(a1)
-; LP64F-NEXT:    flw fs6, 104(a1)
-; LP64F-NEXT:    flw fs7, 108(a1)
+; LP64F-NEXT:    flw fa4, 16(a1)
+; LP64F-NEXT:    flw fa3, 20(a1)
+; LP64F-NEXT:    flw fa2, 24(a1)
+; LP64F-NEXT:    flw fa1, 28(a1)
+; LP64F-NEXT:    flw fa0, 32(a1)
+; LP64F-NEXT:    flw ft0, 36(a1)
+; LP64F-NEXT:    flw ft1, 40(a1)
+; LP64F-NEXT:    flw ft2, 44(a1)
+; LP64F-NEXT:    flw ft3, 48(a1)
+; LP64F-NEXT:    flw ft4, 52(a1)
+; LP64F-NEXT:    flw ft5, 56(a1)
+; LP64F-NEXT:    flw ft6, 60(a1)
+; LP64F-NEXT:    flw ft7, 64(a1)
+; LP64F-NEXT:    flw fa6, 68(a1)
+; LP64F-NEXT:    flw fa7, 72(a1)
+; LP64F-NEXT:    flw ft8, 76(a1)
+; LP64F-NEXT:    flw ft9, 80(a1)
+; LP64F-NEXT:    flw ft10, 84(a1)
+; LP64F-NEXT:    flw ft11, 88(a1)
+; LP64F-NEXT:    flw fs0, 92(a1)
+; LP64F-NEXT:    flw fs1, 96(a1)
+; LP64F-NEXT:    flw fs2, 100(a1)
+; LP64F-NEXT:    flw fs3, 104(a1)
+; LP64F-NEXT:    flw fs4, 108(a1)
+; LP64F-NEXT:    flw fs5, 112(a1)
+; LP64F-NEXT:    flw fs6, 116(a1)
+; LP64F-NEXT:    flw fs7, 120(a1)
 ; LP64F-NEXT:    flw fs8, 124(a1)
-; LP64F-NEXT:    flw fs9, 120(a1)
-; LP64F-NEXT:    flw fs10, 116(a1)
-; LP64F-NEXT:    flw fs11, 112(a1)
+; LP64F-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64F-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64F-NEXT:    fsw fs8, 124(a1)
-; LP64F-NEXT:    fsw fs9, 120(a1)
-; LP64F-NEXT:    fsw fs10, 116(a1)
-; LP64F-NEXT:    fsw fs11, 112(a1)
-; LP64F-NEXT:    fsw fs7, 108(a1)
-; LP64F-NEXT:    fsw fs6, 104(a1)
-; LP64F-NEXT:    fsw fs5, 100(a1)
-; LP64F-NEXT:    fsw fs4, 96(a1)
-; LP64F-NEXT:    fsw fs3, 92(a1)
-; LP64F-NEXT:    fsw fs2, 88(a1)
-; LP64F-NEXT:    fsw fs1, 84(a1)
-; LP64F-NEXT:    fsw fs0, 80(a1)
-; LP64F-NEXT:    fsw ft11, 76(a1)
-; LP64F-NEXT:    fsw ft10, 72(a1)
-; LP64F-NEXT:    fsw ft9, 68(a1)
-; LP64F-NEXT:    fsw ft8, 64(a1)
-; LP64F-NEXT:    fsw fa7, 60(a1)
-; LP64F-NEXT:    fsw fa6, 56(a1)
-; LP64F-NEXT:    fsw ft7, 52(a1)
-; LP64F-NEXT:    fsw ft6, 48(a1)
-; LP64F-NEXT:    fsw ft5, 44(a1)
-; LP64F-NEXT:    fsw ft4, 40(a1)
-; LP64F-NEXT:    fsw ft3, 36(a1)
-; LP64F-NEXT:    fsw ft2, 32(a1)
-; LP64F-NEXT:    fsw ft1, 28(a1)
-; LP64F-NEXT:    fsw ft0, 24(a1)
-; LP64F-NEXT:    fsw fa0, 20(a1)
-; LP64F-NEXT:    fsw fa1, 16(a1)
-; LP64F-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64F-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64F-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64F-NEXT:    fsw fs7, 120(a1)
+; LP64F-NEXT:    fsw fs6, 116(a1)
+; LP64F-NEXT:    fsw fs5, 112(a1)
+; LP64F-NEXT:    fsw fs4, 108(a1)
+; LP64F-NEXT:    fsw fs3, 104(a1)
+; LP64F-NEXT:    fsw fs2, 100(a1)
+; LP64F-NEXT:    fsw fs1, 96(a1)
+; LP64F-NEXT:    fsw fs0, 92(a1)
+; LP64F-NEXT:    fsw ft11, 88(a1)
+; LP64F-NEXT:    fsw ft10, 84(a1)
+; LP64F-NEXT:    fsw ft9, 80(a1)
+; LP64F-NEXT:    fsw ft8, 76(a1)
+; LP64F-NEXT:    fsw fa7, 72(a1)
+; LP64F-NEXT:    fsw fa6, 68(a1)
+; LP64F-NEXT:    fsw ft7, 64(a1)
+; LP64F-NEXT:    fsw ft6, 60(a1)
+; LP64F-NEXT:    fsw ft5, 56(a1)
+; LP64F-NEXT:    fsw ft4, 52(a1)
+; LP64F-NEXT:    fsw ft3, 48(a1)
+; LP64F-NEXT:    fsw ft2, 44(a1)
+; LP64F-NEXT:    fsw ft1, 40(a1)
+; LP64F-NEXT:    fsw ft0, 36(a1)
+; LP64F-NEXT:    fsw fa0, 32(a1)
+; LP64F-NEXT:    fsw fa1, 28(a1)
+; LP64F-NEXT:    fsw fa2, 24(a1)
+; LP64F-NEXT:    fsw fa3, 20(a1)
+; LP64F-NEXT:    fsw fa4, 16(a1)
+; LP64F-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64F-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64F-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64F-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64F-NEXT:    flw fs0, 44(sp) # 4-byte Folded Reload
 ; LP64F-NEXT:    flw fs1, 40(sp) # 4-byte Folded Reload
@@ -370,69 +370,69 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
 ; ILP32D-NEXT:    flw fa5, %lo(var)(a0)
-; ILP32D-NEXT:    flw fa4, %lo(var+4)(a0)
-; ILP32D-NEXT:    flw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    flw fa1, 16(a1)
-; ILP32D-NEXT:    flw fa0, 20(a1)
-; ILP32D-NEXT:    flw ft0, 24(a1)
-; ILP32D-NEXT:    flw ft1, 28(a1)
-; ILP32D-NEXT:    flw ft2, 32(a1)
-; ILP32D-NEXT:    flw ft3, 36(a1)
-; ILP32D-NEXT:    flw ft4, 40(a1)
-; ILP32D-NEXT:    flw ft5, 44(a1)
-; ILP32D-NEXT:    flw ft6, 48(a1)
-; ILP32D-NEXT:    flw ft7, 52(a1)
-; ILP32D-NEXT:    flw fa6, 56(a1)
-; ILP32D-NEXT:    flw fa7, 60(a1)
-; ILP32D-NEXT:    flw ft8, 64(a1)
-; ILP32D-NEXT:    flw ft9, 68(a1)
-; ILP32D-NEXT:    flw ft10, 72(a1)
-; ILP32D-NEXT:    flw ft11, 76(a1)
-; ILP32D-NEXT:    flw fs0, 80(a1)
-; ILP32D-NEXT:    flw fs1, 84(a1)
-; ILP32D-NEXT:    flw fs2, 88(a1)
-; ILP32D-NEXT:    flw fs3, 92(a1)
-; ILP32D-NEXT:    flw fs4, 96(a1)
-; ILP32D-NEXT:    flw fs5, 100(a1)
-; ILP32D-NEXT:    flw fs6, 104(a1)
-; ILP32D-NEXT:    flw fs7, 108(a1)
+; ILP32D-NEXT:    flw fa4, 16(a1)
+; ILP32D-NEXT:    flw fa3, 20(a1)
+; ILP32D-NEXT:    flw fa2, 24(a1)
+; ILP32D-NEXT:    flw fa1, 28(a1)
+; ILP32D-NEXT:    flw fa0, 32(a1)
+; ILP32D-NEXT:    flw ft0, 36(a1)
+; ILP32D-NEXT:    flw ft1, 40(a1)
+; ILP32D-NEXT:    flw ft2, 44(a1)
+; ILP32D-NEXT:    flw ft3, 48(a1)
+; ILP32D-NEXT:    flw ft4, 52(a1)
+; ILP32D-NEXT:    flw ft5, 56(a1)
+; ILP32D-NEXT:    flw ft6, 60(a1)
+; ILP32D-NEXT:    flw ft7, 64(a1)
+; ILP32D-NEXT:    flw fa6, 68(a1)
+; ILP32D-NEXT:    flw fa7, 72(a1)
+; ILP32D-NEXT:    flw ft8, 76(a1)
+; ILP32D-NEXT:    flw ft9, 80(a1)
+; ILP32D-NEXT:    flw ft10, 84(a1)
+; ILP32D-NEXT:    flw ft11, 88(a1)
+; ILP32D-NEXT:    flw fs0, 92(a1)
+; ILP32D-NEXT:    flw fs1, 96(a1)
+; ILP32D-NEXT:    flw fs2, 100(a1)
+; ILP32D-NEXT:    flw fs3, 104(a1)
+; ILP32D-NEXT:    flw fs4, 108(a1)
+; ILP32D-NEXT:    flw fs5, 112(a1)
+; ILP32D-NEXT:    flw fs6, 116(a1)
+; ILP32D-NEXT:    flw fs7, 120(a1)
 ; ILP32D-NEXT:    flw fs8, 124(a1)
-; ILP32D-NEXT:    flw fs9, 120(a1)
-; ILP32D-NEXT:    flw fs10, 116(a1)
-; ILP32D-NEXT:    flw fs11, 112(a1)
+; ILP32D-NEXT:    flw fs9, %lo(var+4)(a0)
+; ILP32D-NEXT:    flw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; ILP32D-NEXT:    fsw fs8, 124(a1)
-; ILP32D-NEXT:    fsw fs9, 120(a1)
-; ILP32D-NEXT:    fsw fs10, 116(a1)
-; ILP32D-NEXT:    fsw fs11, 112(a1)
-; ILP32D-NEXT:    fsw fs7, 108(a1)
-; ILP32D-NEXT:    fsw fs6, 104(a1)
-; ILP32D-NEXT:    fsw fs5, 100(a1)
-; ILP32D-NEXT:    fsw fs4, 96(a1)
-; ILP32D-NEXT:    fsw fs3, 92(a1)
-; ILP32D-NEXT:    fsw fs2, 88(a1)
-; ILP32D-NEXT:    fsw fs1, 84(a1)
-; ILP32D-NEXT:    fsw fs0, 80(a1)
-; ILP32D-NEXT:    fsw ft11, 76(a1)
-; ILP32D-NEXT:    fsw ft10, 72(a1)
-; ILP32D-NEXT:    fsw ft9, 68(a1)
-; ILP32D-NEXT:    fsw ft8, 64(a1)
-; ILP32D-NEXT:    fsw fa7, 60(a1)
-; ILP32D-NEXT:    fsw fa6, 56(a1)
-; ILP32D-NEXT:    fsw ft7, 52(a1)
-; ILP32D-NEXT:    fsw ft6, 48(a1)
-; ILP32D-NEXT:    fsw ft5, 44(a1)
-; ILP32D-NEXT:    fsw ft4, 40(a1)
-; ILP32D-NEXT:    fsw ft3, 36(a1)
-; ILP32D-NEXT:    fsw ft2, 32(a1)
-; ILP32D-NEXT:    fsw ft1, 28(a1)
-; ILP32D-NEXT:    fsw ft0, 24(a1)
-; ILP32D-NEXT:    fsw fa0, 20(a1)
-; ILP32D-NEXT:    fsw fa1, 16(a1)
-; ILP32D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; ILP32D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; ILP32D-NEXT:    fsw fs7, 120(a1)
+; ILP32D-NEXT:    fsw fs6, 116(a1)
+; ILP32D-NEXT:    fsw fs5, 112(a1)
+; ILP32D-NEXT:    fsw fs4, 108(a1)
+; ILP32D-NEXT:    fsw fs3, 104(a1)
+; ILP32D-NEXT:    fsw fs2, 100(a1)
+; ILP32D-NEXT:    fsw fs1, 96(a1)
+; ILP32D-NEXT:    fsw fs0, 92(a1)
+; ILP32D-NEXT:    fsw ft11, 88(a1)
+; ILP32D-NEXT:    fsw ft10, 84(a1)
+; ILP32D-NEXT:    fsw ft9, 80(a1)
+; ILP32D-NEXT:    fsw ft8, 76(a1)
+; ILP32D-NEXT:    fsw fa7, 72(a1)
+; ILP32D-NEXT:    fsw fa6, 68(a1)
+; ILP32D-NEXT:    fsw ft7, 64(a1)
+; ILP32D-NEXT:    fsw ft6, 60(a1)
+; ILP32D-NEXT:    fsw ft5, 56(a1)
+; ILP32D-NEXT:    fsw ft4, 52(a1)
+; ILP32D-NEXT:    fsw ft3, 48(a1)
+; ILP32D-NEXT:    fsw ft2, 44(a1)
+; ILP32D-NEXT:    fsw ft1, 40(a1)
+; ILP32D-NEXT:    fsw ft0, 36(a1)
+; ILP32D-NEXT:    fsw fa0, 32(a1)
+; ILP32D-NEXT:    fsw fa1, 28(a1)
+; ILP32D-NEXT:    fsw fa2, 24(a1)
+; ILP32D-NEXT:    fsw fa3, 20(a1)
+; ILP32D-NEXT:    fsw fa4, 16(a1)
+; ILP32D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; ILP32D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; ILP32D-NEXT:    fsw fa5, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
@@ -466,69 +466,69 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
 ; LP64D-NEXT:    flw fa5, %lo(var)(a0)
-; LP64D-NEXT:    flw fa4, %lo(var+4)(a0)
-; LP64D-NEXT:    flw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    flw fa2, %lo(var+12)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    flw fa1, 16(a1)
-; LP64D-NEXT:    flw fa0, 20(a1)
-; LP64D-NEXT:    flw ft0, 24(a1)
-; LP64D-NEXT:    flw ft1, 28(a1)
-; LP64D-NEXT:    flw ft2, 32(a1)
-; LP64D-NEXT:    flw ft3, 36(a1)
-; LP64D-NEXT:    flw ft4, 40(a1)
-; LP64D-NEXT:    flw ft5, 44(a1)
-; LP64D-NEXT:    flw ft6, 48(a1)
-; LP64D-NEXT:    flw ft7, 52(a1)
-; LP64D-NEXT:    flw fa6, 56(a1)
-; LP64D-NEXT:    flw fa7, 60(a1)
-; LP64D-NEXT:    flw ft8, 64(a1)
-; LP64D-NEXT:    flw ft9, 68(a1)
-; LP64D-NEXT:    flw ft10, 72(a1)
-; LP64D-NEXT:    flw ft11, 76(a1)
-; LP64D-NEXT:    flw fs0, 80(a1)
-; LP64D-NEXT:    flw fs1, 84(a1)
-; LP64D-NEXT:    flw fs2, 88(a1)
-; LP64D-NEXT:    flw fs3, 92(a1)
-; LP64D-NEXT:    flw fs4, 96(a1)
-; LP64D-NEXT:    flw fs5, 100(a1)
-; LP64D-NEXT:    flw fs6, 104(a1)
-; LP64D-NEXT:    flw fs7, 108(a1)
+; LP64D-NEXT:    flw fa4, 16(a1)
+; LP64D-NEXT:    flw fa3, 20(a1)
+; LP64D-NEXT:    flw fa2, 24(a1)
+; LP64D-NEXT:    flw fa1, 28(a1)
+; LP64D-NEXT:    flw fa0, 32(a1)
+; LP64D-NEXT:    flw ft0, 36(a1)
+; LP64D-NEXT:    flw ft1, 40(a1)
+; LP64D-NEXT:    flw ft2, 44(a1)
+; LP64D-NEXT:    flw ft3, 48(a1)
+; LP64D-NEXT:    flw ft4, 52(a1)
+; LP64D-NEXT:    flw ft5, 56(a1)
+; LP64D-NEXT:    flw ft6, 60(a1)
+; LP64D-NEXT:    flw ft7, 64(a1)
+; LP64D-NEXT:    flw fa6, 68(a1)
+; LP64D-NEXT:    flw fa7, 72(a1)
+; LP64D-NEXT:    flw ft8, 76(a1)
+; LP64D-NEXT:    flw ft9, 80(a1)
+; LP64D-NEXT:    flw ft10, 84(a1)
+; LP64D-NEXT:    flw ft11, 88(a1)
+; LP64D-NEXT:    flw fs0, 92(a1)
+; LP64D-NEXT:    flw fs1, 96(a1)
+; LP64D-NEXT:    flw fs2, 100(a1)
+; LP64D-NEXT:    flw fs3, 104(a1)
+; LP64D-NEXT:    flw fs4, 108(a1)
+; LP64D-NEXT:    flw fs5, 112(a1)
+; LP64D-NEXT:    flw fs6, 116(a1)
+; LP64D-NEXT:    flw fs7, 120(a1)
 ; LP64D-NEXT:    flw fs8, 124(a1)
-; LP64D-NEXT:    flw fs9, 120(a1)
-; LP64D-NEXT:    flw fs10, 116(a1)
-; LP64D-NEXT:    flw fs11, 112(a1)
+; LP64D-NEXT:    flw fs9, %lo(var+4)(a0)
+; LP64D-NEXT:    flw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    flw fs11, %lo(var+12)(a0)
 ; LP64D-NEXT:    fsw fs8, 124(a1)
-; LP64D-NEXT:    fsw fs9, 120(a1)
-; LP64D-NEXT:    fsw fs10, 116(a1)
-; LP64D-NEXT:    fsw fs11, 112(a1)
-; LP64D-NEXT:    fsw fs7, 108(a1)
-; LP64D-NEXT:    fsw fs6, 104(a1)
-; LP64D-NEXT:    fsw fs5, 100(a1)
-; LP64D-NEXT:    fsw fs4, 96(a1)
-; LP64D-NEXT:    fsw fs3, 92(a1)
-; LP64D-NEXT:    fsw fs2, 88(a1)
-; LP64D-NEXT:    fsw fs1, 84(a1)
-; LP64D-NEXT:    fsw fs0, 80(a1)
-; LP64D-NEXT:    fsw ft11, 76(a1)
-; LP64D-NEXT:    fsw ft10, 72(a1)
-; LP64D-NEXT:    fsw ft9, 68(a1)
-; LP64D-NEXT:    fsw ft8, 64(a1)
-; LP64D-NEXT:    fsw fa7, 60(a1)
-; LP64D-NEXT:    fsw fa6, 56(a1)
-; LP64D-NEXT:    fsw ft7, 52(a1)
-; LP64D-NEXT:    fsw ft6, 48(a1)
-; LP64D-NEXT:    fsw ft5, 44(a1)
-; LP64D-NEXT:    fsw ft4, 40(a1)
-; LP64D-NEXT:    fsw ft3, 36(a1)
-; LP64D-NEXT:    fsw ft2, 32(a1)
-; LP64D-NEXT:    fsw ft1, 28(a1)
-; LP64D-NEXT:    fsw ft0, 24(a1)
-; LP64D-NEXT:    fsw fa0, 20(a1)
-; LP64D-NEXT:    fsw fa1, 16(a1)
-; LP64D-NEXT:    fsw fa2, %lo(var+12)(a0)
-; LP64D-NEXT:    fsw fa3, %lo(var+8)(a0)
-; LP64D-NEXT:    fsw fa4, %lo(var+4)(a0)
+; LP64D-NEXT:    fsw fs7, 120(a1)
+; LP64D-NEXT:    fsw fs6, 116(a1)
+; LP64D-NEXT:    fsw fs5, 112(a1)
+; LP64D-NEXT:    fsw fs4, 108(a1)
+; LP64D-NEXT:    fsw fs3, 104(a1)
+; LP64D-NEXT:    fsw fs2, 100(a1)
+; LP64D-NEXT:    fsw fs1, 96(a1)
+; LP64D-NEXT:    fsw fs0, 92(a1)
+; LP64D-NEXT:    fsw ft11, 88(a1)
+; LP64D-NEXT:    fsw ft10, 84(a1)
+; LP64D-NEXT:    fsw ft9, 80(a1)
+; LP64D-NEXT:    fsw ft8, 76(a1)
+; LP64D-NEXT:    fsw fa7, 72(a1)
+; LP64D-NEXT:    fsw fa6, 68(a1)
+; LP64D-NEXT:    fsw ft7, 64(a1)
+; LP64D-NEXT:    fsw ft6, 60(a1)
+; LP64D-NEXT:    fsw ft5, 56(a1)
+; LP64D-NEXT:    fsw ft4, 52(a1)
+; LP64D-NEXT:    fsw ft3, 48(a1)
+; LP64D-NEXT:    fsw ft2, 44(a1)
+; LP64D-NEXT:    fsw ft1, 40(a1)
+; LP64D-NEXT:    fsw ft0, 36(a1)
+; LP64D-NEXT:    fsw fa0, 32(a1)
+; LP64D-NEXT:    fsw fa1, 28(a1)
+; LP64D-NEXT:    fsw fa2, 24(a1)
+; LP64D-NEXT:    fsw fa3, 20(a1)
+; LP64D-NEXT:    fsw fa4, 16(a1)
+; LP64D-NEXT:    fsw fs11, %lo(var+12)(a0)
+; LP64D-NEXT:    fsw fs10, %lo(var+8)(a0)
+; LP64D-NEXT:    fsw fs9, %lo(var+4)(a0)
 ; LP64D-NEXT:    fsw fa5, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
index 40076316bca8926..a7f582f5f0699a1 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll
@@ -20,141 +20,141 @@ define void @callee() nounwind {
 ; ILP32-LABEL: callee:
 ; ILP32:       # %bb.0:
 ; ILP32-NEXT:    lui a0, %hi(var)
-; ILP32-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32-NEXT:    addi a1, a0, %lo(var)
-; ILP32-NEXT:    fld fa3, 16(a1)
-; ILP32-NEXT:    fld fa2, 24(a1)
-; ILP32-NEXT:    fld fa1, 32(a1)
-; ILP32-NEXT:    fld fa0, 40(a1)
-; ILP32-NEXT:    fld ft0, 48(a1)
-; ILP32-NEXT:    fld ft1, 56(a1)
-; ILP32-NEXT:    fld ft2, 64(a1)
-; ILP32-NEXT:    fld ft3, 72(a1)
-; ILP32-NEXT:    fld ft4, 80(a1)
-; ILP32-NEXT:    fld ft5, 88(a1)
-; ILP32-NEXT:    fld ft6, 96(a1)
-; ILP32-NEXT:    fld ft7, 104(a1)
-; ILP32-NEXT:    fld fa6, 112(a1)
-; ILP32-NEXT:    fld fa7, 120(a1)
-; ILP32-NEXT:    fld ft8, 128(a1)
-; ILP32-NEXT:    fld ft9, 136(a1)
-; ILP32-NEXT:    fld ft10, 144(a1)
-; ILP32-NEXT:    fld ft11, 152(a1)
-; ILP32-NEXT:    fld fs0, 160(a1)
-; ILP32-NEXT:    fld fs1, 168(a1)
-; ILP32-NEXT:    fld fs2, 176(a1)
-; ILP32-NEXT:    fld fs3, 184(a1)
-; ILP32-NEXT:    fld fs4, 192(a1)
-; ILP32-NEXT:    fld fs5, 200(a1)
-; ILP32-NEXT:    fld fs6, 208(a1)
-; ILP32-NEXT:    fld fs7, 216(a1)
-; ILP32-NEXT:    fld fs8, 248(a1)
+; ILP32-NEXT:    fld fa5, 248(a1)
+; ILP32-NEXT:    fld fa4, 16(a1)
+; ILP32-NEXT:    fld fa3, 24(a1)
+; ILP32-NEXT:    fld fa2, 32(a1)
+; ILP32-NEXT:    fld fa1, 40(a1)
+; ILP32-NEXT:    fld fa0, 48(a1)
+; ILP32-NEXT:    fld ft0, 56(a1)
+; ILP32-NEXT:    fld ft1, 64(a1)
+; ILP32-NEXT:    fld ft2, 72(a1)
+; ILP32-NEXT:    fld ft3, 80(a1)
+; ILP32-NEXT:    fld ft4, 88(a1)
+; ILP32-NEXT:    fld ft5, 96(a1)
+; ILP32-NEXT:    fld ft6, 104(a1)
+; ILP32-NEXT:    fld ft7, 112(a1)
+; ILP32-NEXT:    fld fa6, 120(a1)
+; ILP32-NEXT:    fld fa7, 128(a1)
+; ILP32-NEXT:    fld ft8, 136(a1)
+; ILP32-NEXT:    fld ft9, 144(a1)
+; ILP32-NEXT:    fld ft10, 152(a1)
+; ILP32-NEXT:    fld ft11, 160(a1)
+; ILP32-NEXT:    fld fs0, 168(a1)
+; ILP32-NEXT:    fld fs1, 176(a1)
+; ILP32-NEXT:    fld fs2, 184(a1)
+; ILP32-NEXT:    fld fs3, 192(a1)
+; ILP32-NEXT:    fld fs4, 200(a1)
+; ILP32-NEXT:    fld fs5, 208(a1)
+; ILP32-NEXT:    fld fs6, 216(a1)
+; ILP32-NEXT:    fld fs7, 224(a1)
+; ILP32-NEXT:    fld fs8, 232(a1)
 ; ILP32-NEXT:    fld fs9, 240(a1)
-; ILP32-NEXT:    fld fs10, 232(a1)
-; ILP32-NEXT:    fld fs11, 224(a1)
-; ILP32-NEXT:    fsd fs8, 248(a1)
+; ILP32-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fa5, 248(a1)
 ; ILP32-NEXT:    fsd fs9, 240(a1)
-; ILP32-NEXT:    fsd fs10, 232(a1)
-; ILP32-NEXT:    fsd fs11, 224(a1)
-; ILP32-NEXT:    fsd fs7, 216(a1)
-; ILP32-NEXT:    fsd fs6, 208(a1)
-; ILP32-NEXT:    fsd fs5, 200(a1)
-; ILP32-NEXT:    fsd fs4, 192(a1)
-; ILP32-NEXT:    fsd fs3, 184(a1)
-; ILP32-NEXT:    fsd fs2, 176(a1)
-; ILP32-NEXT:    fsd fs1, 168(a1)
-; ILP32-NEXT:    fsd fs0, 160(a1)
-; ILP32-NEXT:    fsd ft11, 152(a1)
-; ILP32-NEXT:    fsd ft10, 144(a1)
-; ILP32-NEXT:    fsd ft9, 136(a1)
-; ILP32-NEXT:    fsd ft8, 128(a1)
-; ILP32-NEXT:    fsd fa7, 120(a1)
-; ILP32-NEXT:    fsd fa6, 112(a1)
-; ILP32-NEXT:    fsd ft7, 104(a1)
-; ILP32-NEXT:    fsd ft6, 96(a1)
-; ILP32-NEXT:    fsd ft5, 88(a1)
-; ILP32-NEXT:    fsd ft4, 80(a1)
-; ILP32-NEXT:    fsd ft3, 72(a1)
-; ILP32-NEXT:    fsd ft2, 64(a1)
-; ILP32-NEXT:    fsd ft1, 56(a1)
-; ILP32-NEXT:    fsd ft0, 48(a1)
-; ILP32-NEXT:    fsd fa0, 40(a1)
-; ILP32-NEXT:    fsd fa1, 32(a1)
-; ILP32-NEXT:    fsd fa2, 24(a1)
-; ILP32-NEXT:    fsd fa3, 16(a1)
-; ILP32-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32-NEXT:    fsd fs8, 232(a1)
+; ILP32-NEXT:    fsd fs7, 224(a1)
+; ILP32-NEXT:    fsd fs6, 216(a1)
+; ILP32-NEXT:    fsd fs5, 208(a1)
+; ILP32-NEXT:    fsd fs4, 200(a1)
+; ILP32-NEXT:    fsd fs3, 192(a1)
+; ILP32-NEXT:    fsd fs2, 184(a1)
+; ILP32-NEXT:    fsd fs1, 176(a1)
+; ILP32-NEXT:    fsd fs0, 168(a1)
+; ILP32-NEXT:    fsd ft11, 160(a1)
+; ILP32-NEXT:    fsd ft10, 152(a1)
+; ILP32-NEXT:    fsd ft9, 144(a1)
+; ILP32-NEXT:    fsd ft8, 136(a1)
+; ILP32-NEXT:    fsd fa7, 128(a1)
+; ILP32-NEXT:    fsd fa6, 120(a1)
+; ILP32-NEXT:    fsd ft7, 112(a1)
+; ILP32-NEXT:    fsd ft6, 104(a1)
+; ILP32-NEXT:    fsd ft5, 96(a1)
+; ILP32-NEXT:    fsd ft4, 88(a1)
+; ILP32-NEXT:    fsd ft3, 80(a1)
+; ILP32-NEXT:    fsd ft2, 72(a1)
+; ILP32-NEXT:    fsd ft1, 64(a1)
+; ILP32-NEXT:    fsd ft0, 56(a1)
+; ILP32-NEXT:    fsd fa0, 48(a1)
+; ILP32-NEXT:    fsd fa1, 40(a1)
+; ILP32-NEXT:    fsd fa2, 32(a1)
+; ILP32-NEXT:    fsd fa3, 24(a1)
+; ILP32-NEXT:    fsd fa4, 16(a1)
+; ILP32-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32-NEXT:    ret
 ;
 ; LP64-LABEL: callee:
 ; LP64:       # %bb.0:
 ; LP64-NEXT:    lui a0, %hi(var)
-; LP64-NEXT:    fld fa5, %lo(var)(a0)
-; LP64-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64-NEXT:    addi a1, a0, %lo(var)
-; LP64-NEXT:    fld fa3, 16(a1)
-; LP64-NEXT:    fld fa2, 24(a1)
-; LP64-NEXT:    fld fa1, 32(a1)
-; LP64-NEXT:    fld fa0, 40(a1)
-; LP64-NEXT:    fld ft0, 48(a1)
-; LP64-NEXT:    fld ft1, 56(a1)
-; LP64-NEXT:    fld ft2, 64(a1)
-; LP64-NEXT:    fld ft3, 72(a1)
-; LP64-NEXT:    fld ft4, 80(a1)
-; LP64-NEXT:    fld ft5, 88(a1)
-; LP64-NEXT:    fld ft6, 96(a1)
-; LP64-NEXT:    fld ft7, 104(a1)
-; LP64-NEXT:    fld fa6, 112(a1)
-; LP64-NEXT:    fld fa7, 120(a1)
-; LP64-NEXT:    fld ft8, 128(a1)
-; LP64-NEXT:    fld ft9, 136(a1)
-; LP64-NEXT:    fld ft10, 144(a1)
-; LP64-NEXT:    fld ft11, 152(a1)
-; LP64-NEXT:    fld fs0, 160(a1)
-; LP64-NEXT:    fld fs1, 168(a1)
-; LP64-NEXT:    fld fs2, 176(a1)
-; LP64-NEXT:    fld fs3, 184(a1)
-; LP64-NEXT:    fld fs4, 192(a1)
-; LP64-NEXT:    fld fs5, 200(a1)
-; LP64-NEXT:    fld fs6, 208(a1)
-; LP64-NEXT:    fld fs7, 216(a1)
-; LP64-NEXT:    fld fs8, 248(a1)
+; LP64-NEXT:    fld fa5, 248(a1)
+; LP64-NEXT:    fld fa4, 16(a1)
+; LP64-NEXT:    fld fa3, 24(a1)
+; LP64-NEXT:    fld fa2, 32(a1)
+; LP64-NEXT:    fld fa1, 40(a1)
+; LP64-NEXT:    fld fa0, 48(a1)
+; LP64-NEXT:    fld ft0, 56(a1)
+; LP64-NEXT:    fld ft1, 64(a1)
+; LP64-NEXT:    fld ft2, 72(a1)
+; LP64-NEXT:    fld ft3, 80(a1)
+; LP64-NEXT:    fld ft4, 88(a1)
+; LP64-NEXT:    fld ft5, 96(a1)
+; LP64-NEXT:    fld ft6, 104(a1)
+; LP64-NEXT:    fld ft7, 112(a1)
+; LP64-NEXT:    fld fa6, 120(a1)
+; LP64-NEXT:    fld fa7, 128(a1)
+; LP64-NEXT:    fld ft8, 136(a1)
+; LP64-NEXT:    fld ft9, 144(a1)
+; LP64-NEXT:    fld ft10, 152(a1)
+; LP64-NEXT:    fld ft11, 160(a1)
+; LP64-NEXT:    fld fs0, 168(a1)
+; LP64-NEXT:    fld fs1, 176(a1)
+; LP64-NEXT:    fld fs2, 184(a1)
+; LP64-NEXT:    fld fs3, 192(a1)
+; LP64-NEXT:    fld fs4, 200(a1)
+; LP64-NEXT:    fld fs5, 208(a1)
+; LP64-NEXT:    fld fs6, 216(a1)
+; LP64-NEXT:    fld fs7, 224(a1)
+; LP64-NEXT:    fld fs8, 232(a1)
 ; LP64-NEXT:    fld fs9, 240(a1)
-; LP64-NEXT:    fld fs10, 232(a1)
-; LP64-NEXT:    fld fs11, 224(a1)
-; LP64-NEXT:    fsd fs8, 248(a1)
+; LP64-NEXT:    fld fs10, %lo(var)(a0)
+; LP64-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fa5, 248(a1)
 ; LP64-NEXT:    fsd fs9, 240(a1)
-; LP64-NEXT:    fsd fs10, 232(a1)
-; LP64-NEXT:    fsd fs11, 224(a1)
-; LP64-NEXT:    fsd fs7, 216(a1)
-; LP64-NEXT:    fsd fs6, 208(a1)
-; LP64-NEXT:    fsd fs5, 200(a1)
-; LP64-NEXT:    fsd fs4, 192(a1)
-; LP64-NEXT:    fsd fs3, 184(a1)
-; LP64-NEXT:    fsd fs2, 176(a1)
-; LP64-NEXT:    fsd fs1, 168(a1)
-; LP64-NEXT:    fsd fs0, 160(a1)
-; LP64-NEXT:    fsd ft11, 152(a1)
-; LP64-NEXT:    fsd ft10, 144(a1)
-; LP64-NEXT:    fsd ft9, 136(a1)
-; LP64-NEXT:    fsd ft8, 128(a1)
-; LP64-NEXT:    fsd fa7, 120(a1)
-; LP64-NEXT:    fsd fa6, 112(a1)
-; LP64-NEXT:    fsd ft7, 104(a1)
-; LP64-NEXT:    fsd ft6, 96(a1)
-; LP64-NEXT:    fsd ft5, 88(a1)
-; LP64-NEXT:    fsd ft4, 80(a1)
-; LP64-NEXT:    fsd ft3, 72(a1)
-; LP64-NEXT:    fsd ft2, 64(a1)
-; LP64-NEXT:    fsd ft1, 56(a1)
-; LP64-NEXT:    fsd ft0, 48(a1)
-; LP64-NEXT:    fsd fa0, 40(a1)
-; LP64-NEXT:    fsd fa1, 32(a1)
-; LP64-NEXT:    fsd fa2, 24(a1)
-; LP64-NEXT:    fsd fa3, 16(a1)
-; LP64-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64-NEXT:    fsd fs8, 232(a1)
+; LP64-NEXT:    fsd fs7, 224(a1)
+; LP64-NEXT:    fsd fs6, 216(a1)
+; LP64-NEXT:    fsd fs5, 208(a1)
+; LP64-NEXT:    fsd fs4, 200(a1)
+; LP64-NEXT:    fsd fs3, 192(a1)
+; LP64-NEXT:    fsd fs2, 184(a1)
+; LP64-NEXT:    fsd fs1, 176(a1)
+; LP64-NEXT:    fsd fs0, 168(a1)
+; LP64-NEXT:    fsd ft11, 160(a1)
+; LP64-NEXT:    fsd ft10, 152(a1)
+; LP64-NEXT:    fsd ft9, 144(a1)
+; LP64-NEXT:    fsd ft8, 136(a1)
+; LP64-NEXT:    fsd fa7, 128(a1)
+; LP64-NEXT:    fsd fa6, 120(a1)
+; LP64-NEXT:    fsd ft7, 112(a1)
+; LP64-NEXT:    fsd ft6, 104(a1)
+; LP64-NEXT:    fsd ft5, 96(a1)
+; LP64-NEXT:    fsd ft4, 88(a1)
+; LP64-NEXT:    fsd ft3, 80(a1)
+; LP64-NEXT:    fsd ft2, 72(a1)
+; LP64-NEXT:    fsd ft1, 64(a1)
+; LP64-NEXT:    fsd ft0, 56(a1)
+; LP64-NEXT:    fsd fa0, 48(a1)
+; LP64-NEXT:    fsd fa1, 40(a1)
+; LP64-NEXT:    fsd fa2, 32(a1)
+; LP64-NEXT:    fsd fa3, 24(a1)
+; LP64-NEXT:    fsd fa4, 16(a1)
+; LP64-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64-NEXT:    ret
 ;
 ; ILP32D-LABEL: callee:
@@ -173,71 +173,71 @@ define void @callee() nounwind {
 ; ILP32D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; ILP32D-NEXT:    lui a0, %hi(var)
-; ILP32D-NEXT:    fld fa5, %lo(var)(a0)
-; ILP32D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; ILP32D-NEXT:    addi a1, a0, %lo(var)
-; ILP32D-NEXT:    fld fa3, 16(a1)
-; ILP32D-NEXT:    fld fa2, 24(a1)
-; ILP32D-NEXT:    fld fa1, 32(a1)
-; ILP32D-NEXT:    fld fa0, 40(a1)
-; ILP32D-NEXT:    fld ft0, 48(a1)
-; ILP32D-NEXT:    fld ft1, 56(a1)
-; ILP32D-NEXT:    fld ft2, 64(a1)
-; ILP32D-NEXT:    fld ft3, 72(a1)
-; ILP32D-NEXT:    fld ft4, 80(a1)
-; ILP32D-NEXT:    fld ft5, 88(a1)
-; ILP32D-NEXT:    fld ft6, 96(a1)
-; ILP32D-NEXT:    fld ft7, 104(a1)
-; ILP32D-NEXT:    fld fa6, 112(a1)
-; ILP32D-NEXT:    fld fa7, 120(a1)
-; ILP32D-NEXT:    fld ft8, 128(a1)
-; ILP32D-NEXT:    fld ft9, 136(a1)
-; ILP32D-NEXT:    fld ft10, 144(a1)
-; ILP32D-NEXT:    fld ft11, 152(a1)
-; ILP32D-NEXT:    fld fs0, 160(a1)
-; ILP32D-NEXT:    fld fs1, 168(a1)
-; ILP32D-NEXT:    fld fs2, 176(a1)
-; ILP32D-NEXT:    fld fs3, 184(a1)
-; ILP32D-NEXT:    fld fs4, 192(a1)
-; ILP32D-NEXT:    fld fs5, 200(a1)
-; ILP32D-NEXT:    fld fs6, 208(a1)
-; ILP32D-NEXT:    fld fs7, 216(a1)
-; ILP32D-NEXT:    fld fs8, 248(a1)
+; ILP32D-NEXT:    fld fa5, 248(a1)
+; ILP32D-NEXT:    fld fa4, 16(a1)
+; ILP32D-NEXT:    fld fa3, 24(a1)
+; ILP32D-NEXT:    fld fa2, 32(a1)
+; ILP32D-NEXT:    fld fa1, 40(a1)
+; ILP32D-NEXT:    fld fa0, 48(a1)
+; ILP32D-NEXT:    fld ft0, 56(a1)
+; ILP32D-NEXT:    fld ft1, 64(a1)
+; ILP32D-NEXT:    fld ft2, 72(a1)
+; ILP32D-NEXT:    fld ft3, 80(a1)
+; ILP32D-NEXT:    fld ft4, 88(a1)
+; ILP32D-NEXT:    fld ft5, 96(a1)
+; ILP32D-NEXT:    fld ft6, 104(a1)
+; ILP32D-NEXT:    fld ft7, 112(a1)
+; ILP32D-NEXT:    fld fa6, 120(a1)
+; ILP32D-NEXT:    fld fa7, 128(a1)
+; ILP32D-NEXT:    fld ft8, 136(a1)
+; ILP32D-NEXT:    fld ft9, 144(a1)
+; ILP32D-NEXT:    fld ft10, 152(a1)
+; ILP32D-NEXT:    fld ft11, 160(a1)
+; ILP32D-NEXT:    fld fs0, 168(a1)
+; ILP32D-NEXT:    fld fs1, 176(a1)
+; ILP32D-NEXT:    fld fs2, 184(a1)
+; ILP32D-NEXT:    fld fs3, 192(a1)
+; ILP32D-NEXT:    fld fs4, 200(a1)
+; ILP32D-NEXT:    fld fs5, 208(a1)
+; ILP32D-NEXT:    fld fs6, 216(a1)
+; ILP32D-NEXT:    fld fs7, 224(a1)
+; ILP32D-NEXT:    fld fs8, 232(a1)
 ; ILP32D-NEXT:    fld fs9, 240(a1)
-; ILP32D-NEXT:    fld fs10, 232(a1)
-; ILP32D-NEXT:    fld fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs8, 248(a1)
+; ILP32D-NEXT:    fld fs10, %lo(var)(a0)
+; ILP32D-NEXT:    fld fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fa5, 248(a1)
 ; ILP32D-NEXT:    fsd fs9, 240(a1)
-; ILP32D-NEXT:    fsd fs10, 232(a1)
-; ILP32D-NEXT:    fsd fs11, 224(a1)
-; ILP32D-NEXT:    fsd fs7, 216(a1)
-; ILP32D-NEXT:    fsd fs6, 208(a1)
-; ILP32D-NEXT:    fsd fs5, 200(a1)
-; ILP32D-NEXT:    fsd fs4, 192(a1)
-; ILP32D-NEXT:    fsd fs3, 184(a1)
-; ILP32D-NEXT:    fsd fs2, 176(a1)
-; ILP32D-NEXT:    fsd fs1, 168(a1)
-; ILP32D-NEXT:    fsd fs0, 160(a1)
-; ILP32D-NEXT:    fsd ft11, 152(a1)
-; ILP32D-NEXT:    fsd ft10, 144(a1)
-; ILP32D-NEXT:    fsd ft9, 136(a1)
-; ILP32D-NEXT:    fsd ft8, 128(a1)
-; ILP32D-NEXT:    fsd fa7, 120(a1)
-; ILP32D-NEXT:    fsd fa6, 112(a1)
-; ILP32D-NEXT:    fsd ft7, 104(a1)
-; ILP32D-NEXT:    fsd ft6, 96(a1)
-; ILP32D-NEXT:    fsd ft5, 88(a1)
-; ILP32D-NEXT:    fsd ft4, 80(a1)
-; ILP32D-NEXT:    fsd ft3, 72(a1)
-; ILP32D-NEXT:    fsd ft2, 64(a1)
-; ILP32D-NEXT:    fsd ft1, 56(a1)
-; ILP32D-NEXT:    fsd ft0, 48(a1)
-; ILP32D-NEXT:    fsd fa0, 40(a1)
-; ILP32D-NEXT:    fsd fa1, 32(a1)
-; ILP32D-NEXT:    fsd fa2, 24(a1)
-; ILP32D-NEXT:    fsd fa3, 16(a1)
-; ILP32D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; ILP32D-NEXT:    fsd fa5, %lo(var)(a0)
+; ILP32D-NEXT:    fsd fs8, 232(a1)
+; ILP32D-NEXT:    fsd fs7, 224(a1)
+; ILP32D-NEXT:    fsd fs6, 216(a1)
+; ILP32D-NEXT:    fsd fs5, 208(a1)
+; ILP32D-NEXT:    fsd fs4, 200(a1)
+; ILP32D-NEXT:    fsd fs3, 192(a1)
+; ILP32D-NEXT:    fsd fs2, 184(a1)
+; ILP32D-NEXT:    fsd fs1, 176(a1)
+; ILP32D-NEXT:    fsd fs0, 168(a1)
+; ILP32D-NEXT:    fsd ft11, 160(a1)
+; ILP32D-NEXT:    fsd ft10, 152(a1)
+; ILP32D-NEXT:    fsd ft9, 144(a1)
+; ILP32D-NEXT:    fsd ft8, 136(a1)
+; ILP32D-NEXT:    fsd fa7, 128(a1)
+; ILP32D-NEXT:    fsd fa6, 120(a1)
+; ILP32D-NEXT:    fsd ft7, 112(a1)
+; ILP32D-NEXT:    fsd ft6, 104(a1)
+; ILP32D-NEXT:    fsd ft5, 96(a1)
+; ILP32D-NEXT:    fsd ft4, 88(a1)
+; ILP32D-NEXT:    fsd ft3, 80(a1)
+; ILP32D-NEXT:    fsd ft2, 72(a1)
+; ILP32D-NEXT:    fsd ft1, 64(a1)
+; ILP32D-NEXT:    fsd ft0, 56(a1)
+; ILP32D-NEXT:    fsd fa0, 48(a1)
+; ILP32D-NEXT:    fsd fa1, 40(a1)
+; ILP32D-NEXT:    fsd fa2, 32(a1)
+; ILP32D-NEXT:    fsd fa3, 24(a1)
+; ILP32D-NEXT:    fsd fa4, 16(a1)
+; ILP32D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; ILP32D-NEXT:    fsd fs10, %lo(var)(a0)
 ; ILP32D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; ILP32D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
@@ -269,71 +269,71 @@ define void @callee() nounwind {
 ; LP64D-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    fsd fs11, 0(sp) # 8-byte Folded Spill
 ; LP64D-NEXT:    lui a0, %hi(var)
-; LP64D-NEXT:    fld fa5, %lo(var)(a0)
-; LP64D-NEXT:    fld fa4, %lo(var+8)(a0)
 ; LP64D-NEXT:    addi a1, a0, %lo(var)
-; LP64D-NEXT:    fld fa3, 16(a1)
-; LP64D-NEXT:    fld fa2, 24(a1)
-; LP64D-NEXT:    fld fa1, 32(a1)
-; LP64D-NEXT:    fld fa0, 40(a1)
-; LP64D-NEXT:    fld ft0, 48(a1)
-; LP64D-NEXT:    fld ft1, 56(a1)
-; LP64D-NEXT:    fld ft2, 64(a1)
-; LP64D-NEXT:    fld ft3, 72(a1)
-; LP64D-NEXT:    fld ft4, 80(a1)
-; LP64D-NEXT:    fld ft5, 88(a1)
-; LP64D-NEXT:    fld ft6, 96(a1)
-; LP64D-NEXT:    fld ft7, 104(a1)
-; LP64D-NEXT:    fld fa6, 112(a1)
-; LP64D-NEXT:    fld fa7, 120(a1)
-; LP64D-NEXT:    fld ft8, 128(a1)
-; LP64D-NEXT:    fld ft9, 136(a1)
-; LP64D-NEXT:    fld ft10, 144(a1)
-; LP64D-NEXT:    fld ft11, 152(a1)
-; LP64D-NEXT:    fld fs0, 160(a1)
-; LP64D-NEXT:    fld fs1, 168(a1)
-; LP64D-NEXT:    fld fs2, 176(a1)
-; LP64D-NEXT:    fld fs3, 184(a1)
-; LP64D-NEXT:    fld fs4, 192(a1)
-; LP64D-NEXT:    fld fs5, 200(a1)
-; LP64D-NEXT:    fld fs6, 208(a1)
-; LP64D-NEXT:    fld fs7, 216(a1)
-; LP64D-NEXT:    fld fs8, 248(a1)
+; LP64D-NEXT:    fld fa5, 248(a1)
+; LP64D-NEXT:    fld fa4, 16(a1)
+; LP64D-NEXT:    fld fa3, 24(a1)
+; LP64D-NEXT:    fld fa2, 32(a1)
+; LP64D-NEXT:    fld fa1, 40(a1)
+; LP64D-NEXT:    fld fa0, 48(a1)
+; LP64D-NEXT:    fld ft0, 56(a1)
+; LP64D-NEXT:    fld ft1, 64(a1)
+; LP64D-NEXT:    fld ft2, 72(a1)
+; LP64D-NEXT:    fld ft3, 80(a1)
+; LP64D-NEXT:    fld ft4, 88(a1)
+; LP64D-NEXT:    fld ft5, 96(a1)
+; LP64D-NEXT:    fld ft6, 104(a1)
+; LP64D-NEXT:    fld ft7, 112(a1)
+; LP64D-NEXT:    fld fa6, 120(a1)
+; LP64D-NEXT:    fld fa7, 128(a1)
+; LP64D-NEXT:    fld ft8, 136(a1)
+; LP64D-NEXT:    fld ft9, 144(a1)
+; LP64D-NEXT:    fld ft10, 152(a1)
+; LP64D-NEXT:    fld ft11, 160(a1)
+; LP64D-NEXT:    fld fs0, 168(a1)
+; LP64D-NEXT:    fld fs1, 176(a1)
+; LP64D-NEXT:    fld fs2, 184(a1)
+; LP64D-NEXT:    fld fs3, 192(a1)
+; LP64D-NEXT:    fld fs4, 200(a1)
+; LP64D-NEXT:    fld fs5, 208(a1)
+; LP64D-NEXT:    fld fs6, 216(a1)
+; LP64D-NEXT:    fld fs7, 224(a1)
+; LP64D-NEXT:    fld fs8, 232(a1)
 ; LP64D-NEXT:    fld fs9, 240(a1)
-; LP64D-NEXT:    fld fs10, 232(a1)
-; LP64D-NEXT:    fld fs11, 224(a1)
-; LP64D-NEXT:    fsd fs8, 248(a1)
+; LP64D-NEXT:    fld fs10, %lo(var)(a0)
+; LP64D-NEXT:    fld fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fa5, 248(a1)
 ; LP64D-NEXT:    fsd fs9, 240(a1)
-; LP64D-NEXT:    fsd fs10, 232(a1)
-; LP64D-NEXT:    fsd fs11, 224(a1)
-; LP64D-NEXT:    fsd fs7, 216(a1)
-; LP64D-NEXT:    fsd fs6, 208(a1)
-; LP64D-NEXT:    fsd fs5, 200(a1)
-; LP64D-NEXT:    fsd fs4, 192(a1)
-; LP64D-NEXT:    fsd fs3, 184(a1)
-; LP64D-NEXT:    fsd fs2, 176(a1)
-; LP64D-NEXT:    fsd fs1, 168(a1)
-; LP64D-NEXT:    fsd fs0, 160(a1)
-; LP64D-NEXT:    fsd ft11, 152(a1)
-; LP64D-NEXT:    fsd ft10, 144(a1)
-; LP64D-NEXT:    fsd ft9, 136(a1)
-; LP64D-NEXT:    fsd ft8, 128(a1)
-; LP64D-NEXT:    fsd fa7, 120(a1)
-; LP64D-NEXT:    fsd fa6, 112(a1)
-; LP64D-NEXT:    fsd ft7, 104(a1)
-; LP64D-NEXT:    fsd ft6, 96(a1)
-; LP64D-NEXT:    fsd ft5, 88(a1)
-; LP64D-NEXT:    fsd ft4, 80(a1)
-; LP64D-NEXT:    fsd ft3, 72(a1)
-; LP64D-NEXT:    fsd ft2, 64(a1)
-; LP64D-NEXT:    fsd ft1, 56(a1)
-; LP64D-NEXT:    fsd ft0, 48(a1)
-; LP64D-NEXT:    fsd fa0, 40(a1)
-; LP64D-NEXT:    fsd fa1, 32(a1)
-; LP64D-NEXT:    fsd fa2, 24(a1)
-; LP64D-NEXT:    fsd fa3, 16(a1)
-; LP64D-NEXT:    fsd fa4, %lo(var+8)(a0)
-; LP64D-NEXT:    fsd fa5, %lo(var)(a0)
+; LP64D-NEXT:    fsd fs8, 232(a1)
+; LP64D-NEXT:    fsd fs7, 224(a1)
+; LP64D-NEXT:    fsd fs6, 216(a1)
+; LP64D-NEXT:    fsd fs5, 208(a1)
+; LP64D-NEXT:    fsd fs4, 200(a1)
+; LP64D-NEXT:    fsd fs3, 192(a1)
+; LP64D-NEXT:    fsd fs2, 184(a1)
+; LP64D-NEXT:    fsd fs1, 176(a1)
+; LP64D-NEXT:    fsd fs0, 168(a1)
+; LP64D-NEXT:    fsd ft11, 160(a1)
+; LP64D-NEXT:    fsd ft10, 152(a1)
+; LP64D-NEXT:    fsd ft9, 144(a1)
+; LP64D-NEXT:    fsd ft8, 136(a1)
+; LP64D-NEXT:    fsd fa7, 128(a1)
+; LP64D-NEXT:    fsd fa6, 120(a1)
+; LP64D-NEXT:    fsd ft7, 112(a1)
+; LP64D-NEXT:    fsd ft6, 104(a1)
+; LP64D-NEXT:    fsd ft5, 96(a1)
+; LP64D-NEXT:    fsd ft4, 88(a1)
+; LP64D-NEXT:    fsd ft3, 80(a1)
+; LP64D-NEXT:    fsd ft2, 72(a1)
+; LP64D-NEXT:    fsd ft1, 64(a1)
+; LP64D-NEXT:    fsd ft0, 56(a1)
+; LP64D-NEXT:    fsd fa0, 48(a1)
+; LP64D-NEXT:    fsd fa1, 40(a1)
+; LP64D-NEXT:    fsd fa2, 32(a1)
+; LP64D-NEXT:    fsd fa3, 24(a1)
+; LP64D-NEXT:    fsd fa4, 16(a1)
+; LP64D-NEXT:    fsd fs11, %lo(var+8)(a0)
+; LP64D-NEXT:    fsd fs10, %lo(var)(a0)
 ; LP64D-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
 ; LP64D-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index 09ecbbc7e8feb81..a8ca5d02ff78dec 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -50,84 +50,84 @@ define void @callee() nounwind {
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var)
-; RV32I-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-NEXT:    lui a4, %hi(var)
+; RV32I-NEXT:    lw a0, %lo(var)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-NEXT:    sw a0, %lo(var)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -161,86 +161,86 @@ define void @callee() nounwind {
 ; RV32I-WITH-FP-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
 ; RV32I-WITH-FP-NEXT:    addi s0, sp, 80
-; RV32I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV32I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV32I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV32I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV32I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV32I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV32I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV32I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV32I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV32I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV32I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV32I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV32I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV32I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV32I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV32I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV32I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV32I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV32I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV32I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV32I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV32I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV32I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV32I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV32I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV32I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV32I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV32I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV32I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV32I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV32I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV32I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV32I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV32I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV32I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV32I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV32I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV32I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV32I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV32I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV32I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32I-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV32I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV32I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV32I-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
-; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV32I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32I-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-WITH-FP-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -260,84 +260,84 @@ define void @callee() nounwind {
 ; RV32IZCMP-LABEL: callee:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var)
-; RV32IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var)
+; RV32IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV32IZCMP-WITH-FP-LABEL: callee:
@@ -360,81 +360,81 @@ define void @callee() nounwind {
 ; RV32IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -56(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -60(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -64(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -68(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -72(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -76(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, -80(s0) # 4-byte Folded Spill
-; RV32IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV32IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -80(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -76(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -72(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -68(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -64(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -60(s0) # 4-byte Folded Reload
-; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV32IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw a0, -56(s0) # 4-byte Folded Reload
 ; RV32IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV32IZCMP-WITH-FP-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -469,84 +469,84 @@ define void @callee() nounwind {
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var)
-; RV64I-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-NEXT:    lui a4, %hi(var)
+; RV64I-NEXT:    lw a0, %lo(var)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-NEXT:    sw a0, %lo(var)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -580,86 +580,86 @@ define void @callee() nounwind {
 ; RV64I-WITH-FP-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
 ; RV64I-WITH-FP-NEXT:    addi s0, sp, 160
-; RV64I-WITH-FP-NEXT:    lui a6, %hi(var)
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    lui a5, %hi(var)
+; RV64I-WITH-FP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    addi a2, a5, %lo(var)
+; RV64I-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64I-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64I-WITH-FP-NEXT:    lw t1, 28(a5)
-; RV64I-WITH-FP-NEXT:    lw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    lw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    lw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    lw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    lw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    lw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    lw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    lw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    lw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    lw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    lw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    lw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    lw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    lw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    lw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    lw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    lw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64I-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64I-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64I-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64I-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64I-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64I-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64I-WITH-FP-NEXT:    sw ra, 96(a5)
-; RV64I-WITH-FP-NEXT:    sw s11, 92(a5)
-; RV64I-WITH-FP-NEXT:    sw s10, 88(a5)
-; RV64I-WITH-FP-NEXT:    sw s9, 84(a5)
-; RV64I-WITH-FP-NEXT:    sw s8, 80(a5)
-; RV64I-WITH-FP-NEXT:    sw s7, 76(a5)
-; RV64I-WITH-FP-NEXT:    sw s6, 72(a5)
-; RV64I-WITH-FP-NEXT:    sw s5, 68(a5)
-; RV64I-WITH-FP-NEXT:    sw s4, 64(a5)
-; RV64I-WITH-FP-NEXT:    sw s3, 60(a5)
-; RV64I-WITH-FP-NEXT:    sw s2, 56(a5)
-; RV64I-WITH-FP-NEXT:    sw s1, 52(a5)
-; RV64I-WITH-FP-NEXT:    sw t6, 48(a5)
-; RV64I-WITH-FP-NEXT:    sw t5, 44(a5)
-; RV64I-WITH-FP-NEXT:    sw t4, 40(a5)
-; RV64I-WITH-FP-NEXT:    sw t3, 36(a5)
-; RV64I-WITH-FP-NEXT:    sw t2, 32(a5)
-; RV64I-WITH-FP-NEXT:    sw t1, 28(a5)
+; RV64I-WITH-FP-NEXT:    lw t1, 40(a2)
+; RV64I-WITH-FP-NEXT:    lw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    lw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    lw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    lw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    lw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    lw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    lw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    lw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    lw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    lw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    lw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    lw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    lw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    lw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    lw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    lw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    lw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    lw t0, %lo(var+4)(a5)
+; RV64I-WITH-FP-NEXT:    lw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    lw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64I-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64I-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64I-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64I-WITH-FP-NEXT:    sw ra, 108(a2)
+; RV64I-WITH-FP-NEXT:    sw s11, 104(a2)
+; RV64I-WITH-FP-NEXT:    sw s10, 100(a2)
+; RV64I-WITH-FP-NEXT:    sw s9, 96(a2)
+; RV64I-WITH-FP-NEXT:    sw s8, 92(a2)
+; RV64I-WITH-FP-NEXT:    sw s7, 88(a2)
+; RV64I-WITH-FP-NEXT:    sw s6, 84(a2)
+; RV64I-WITH-FP-NEXT:    sw s5, 80(a2)
+; RV64I-WITH-FP-NEXT:    sw s4, 76(a2)
+; RV64I-WITH-FP-NEXT:    sw s3, 72(a2)
+; RV64I-WITH-FP-NEXT:    sw s2, 68(a2)
+; RV64I-WITH-FP-NEXT:    sw s1, 64(a2)
+; RV64I-WITH-FP-NEXT:    sw t6, 60(a2)
+; RV64I-WITH-FP-NEXT:    sw t5, 56(a2)
+; RV64I-WITH-FP-NEXT:    sw t4, 52(a2)
+; RV64I-WITH-FP-NEXT:    sw t3, 48(a2)
+; RV64I-WITH-FP-NEXT:    sw t2, 44(a2)
+; RV64I-WITH-FP-NEXT:    sw t1, 40(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64I-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64I-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64I-WITH-FP-NEXT:    sw a6, %lo(var+12)(a5)
+; RV64I-WITH-FP-NEXT:    sw a7, %lo(var+8)(a5)
+; RV64I-WITH-FP-NEXT:    sw t0, %lo(var+4)(a5)
 ; RV64I-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
-; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
+; RV64I-WITH-FP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64I-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-WITH-FP-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
@@ -679,84 +679,84 @@ define void @callee() nounwind {
 ; RV64IZCMP-LABEL: callee:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var)
-; RV64IZCMP-NEXT:    lw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var)
+; RV64IZCMP-NEXT:    lw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV64IZCMP-WITH-FP-LABEL: callee:
@@ -779,81 +779,81 @@ define void @callee() nounwind {
 ; RV64IZCMP-WITH-FP-NEXT:    lui a6, %hi(var)
 ; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -112(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    addi a2, a6, %lo(var)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -120(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -128(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -136(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    addi a5, a6, %lo(var)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -144(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -152(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    sd a0, -160(s0) # 8-byte Folded Spill
-; RV64IZCMP-WITH-FP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t0, 100(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t1, 96(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s1, 92(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t4, 80(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-WITH-FP-NEXT:    sw t5, 28(a5)
+; RV64IZCMP-WITH-FP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    lw t4, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    lw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a4, 112(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a5, 108(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s1, 104(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t1, 92(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t5, 40(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -160(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 36(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -152(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -144(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -136(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -128(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -120(s0) # 8-byte Folded Reload
-; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var+4)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-WITH-FP-NEXT:    sw t2, %lo(var+12)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t3, %lo(var+8)(a6)
+; RV64IZCMP-WITH-FP-NEXT:    sw t4, %lo(var+4)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld a0, -112(s0) # 8-byte Folded Reload
 ; RV64IZCMP-WITH-FP-NEXT:    sw a0, %lo(var)(a6)
 ; RV64IZCMP-WITH-FP-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 0e4702d13a8cd2f..3a93ac8966025c9 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a3, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a3, 8(a1)
 ; RV32I-FPELIM-NEXT:    lw a4, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a1)
-; RV32I-FPELIM-NEXT:    lw a6, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a6, 8(a0)
 ; RV32I-FPELIM-NEXT:    lw a7, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a5, a6, a5
+; RV32I-FPELIM-NEXT:    lw a1, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
 ; RV32I-FPELIM-NEXT:    xor a4, a7, a4
-; RV32I-FPELIM-NEXT:    or a4, a4, a5
+; RV32I-FPELIM-NEXT:    or a2, a4, a2
+; RV32I-FPELIM-NEXT:    xor a3, a6, a3
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a2, a3, a2
-; RV32I-FPELIM-NEXT:    or a0, a2, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a3, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a3, 8(a1)
 ; RV32I-WITHFP-NEXT:    lw a4, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a1)
-; RV32I-WITHFP-NEXT:    lw a6, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a6, 8(a0)
 ; RV32I-WITHFP-NEXT:    lw a7, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a5, a6, a5
+; RV32I-WITHFP-NEXT:    lw a1, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
 ; RV32I-WITHFP-NEXT:    xor a4, a7, a4
-; RV32I-WITHFP-NEXT:    or a4, a4, a5
+; RV32I-WITHFP-NEXT:    or a2, a4, a2
+; RV32I-WITHFP-NEXT:    xor a3, a6, a3
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a2, a3, a2
-; RV32I-WITHFP-NEXT:    or a0, a2, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -297,21 +297,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lw a1, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a2, 0(a7)
+; RV32I-FPELIM-NEXT:    lw a1, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a2, 8(a0)
 ; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a5, 12(a7)
+; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
+; RV32I-FPELIM-NEXT:    lw a5, 8(a7)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 0(a7)
+; RV32I-FPELIM-NEXT:    xor a1, a4, a1
 ; RV32I-FPELIM-NEXT:    xor a3, a6, a3
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    xor a2, a5, a2
 ; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a2, a1
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    or a0, a0, a2
+; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -322,21 +322,21 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
-; RV32I-WITHFP-NEXT:    lw a1, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a2, 0(a7)
+; RV32I-WITHFP-NEXT:    lw a1, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a2, 8(a0)
 ; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a5, 12(a7)
+; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
+; RV32I-WITHFP-NEXT:    lw a5, 8(a7)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 0(a7)
+; RV32I-WITHFP-NEXT:    xor a1, a4, a1
 ; RV32I-WITHFP-NEXT:    xor a3, a6, a3
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    xor a2, a5, a2
 ; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a2, a1
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    or a0, a0, a2
+; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index adf3630d2a0c9cd..69ffbb0b2511dd8 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a1)
-; RV64I-NEXT:    ld a3, 0(a0)
+; RV64I-NEXT:    ld a2, 24(a1)
+; RV64I-NEXT:    ld a3, 16(a1)
 ; RV64I-NEXT:    ld a4, 8(a1)
-; RV64I-NEXT:    ld a5, 24(a1)
-; RV64I-NEXT:    ld a6, 24(a0)
+; RV64I-NEXT:    ld a5, 24(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
 ; RV64I-NEXT:    ld a7, 8(a0)
-; RV64I-NEXT:    ld a1, 16(a1)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a5, a6, a5
+; RV64I-NEXT:    ld a1, 0(a1)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a2, a5, a2
 ; RV64I-NEXT:    xor a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    or a2, a4, a2
+; RV64I-NEXT:    xor a3, a6, a3
 ; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a2, a3, a2
-; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -161,21 +161,21 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-LABEL: callee_large_scalars_exhausted_regs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    ld a0, 8(sp)
-; RV64I-NEXT:    ld a1, 0(a0)
-; RV64I-NEXT:    ld a2, 0(a7)
+; RV64I-NEXT:    ld a1, 24(a0)
+; RV64I-NEXT:    ld a2, 16(a0)
 ; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a0)
-; RV64I-NEXT:    ld a5, 24(a7)
+; RV64I-NEXT:    ld a4, 24(a7)
+; RV64I-NEXT:    ld a5, 16(a7)
 ; RV64I-NEXT:    ld a6, 8(a7)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a4, a5, a4
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ld a7, 0(a7)
+; RV64I-NEXT:    xor a1, a4, a1
 ; RV64I-NEXT:    xor a3, a6, a3
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    xor a2, a5, a2
 ; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a2, a1
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index cb64e24128b5e37..46b7da2ddc2107a 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -302,27 +302,27 @@ define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    bgez a2, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    or a6, a1, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    snez a5, a1
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -330,27 +330,27 @@ define i128 @abs128(i128 %x) {
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    or a6, a1, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    add a2, a2, a4
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    snez a5, a1
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 0(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
@@ -384,27 +384,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a1, 8(a1)
+; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    bgez a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    neg a5, a1
-; RV32I-NEXT:    or a6, a4, a3
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    or a6, a1, a3
 ; RV32I-NEXT:    snez a6, a6
 ; RV32I-NEXT:    sltu a7, a5, a6
-; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a7
-; RV32I-NEXT:    sub a1, a5, a6
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    snez a4, a4
+; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a4, a5, a6
+; RV32I-NEXT:    snez a5, a1
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    ret
@@ -412,27 +412,27 @@ define i128 @select_abs128(i128 %x) {
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
 ; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 8(a1)
+; RV32ZBB-NEXT:    lw a1, 0(a1)
 ; RV32ZBB-NEXT:    bgez a2, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    neg a5, a1
-; RV32ZBB-NEXT:    or a6, a4, a3
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    or a6, a1, a3
 ; RV32ZBB-NEXT:    snez a6, a6
 ; RV32ZBB-NEXT:    sltu a7, a5, a6
-; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a7
-; RV32ZBB-NEXT:    sub a1, a5, a6
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    snez a4, a4
+; RV32ZBB-NEXT:    add a2, a2, a4
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a4, a5, a6
+; RV32ZBB-NEXT:    snez a5, a1
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:    sub a3, a3, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a1, 0(a0)
+; RV32ZBB-NEXT:    sw a4, 8(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 12(a0)
 ; RV32ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index fb7e4a4d103d04c..e025b715af1d9bf 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -3,15 +3,13 @@
 ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s
 
 define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: udiv_i128:
-; CHECK:    call __udivti3
   %res = udiv i128 %x, %y
   ret i128 %res
 }
 
 define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
-; CHECK-LABEL: udiv_i129:
-; CHECK-NOT: call{{.*}}div
   %res = udiv i129 %x, %y
   ret i129 %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
index 15abc9b75883c8f..8ce5031780c8af4 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -7,14 +7,14 @@
 define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV32-LABEL: ctz_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lw a2, 12(a0)
-; RV32-NEXT:    lw a4, 8(a0)
-; RV32-NEXT:    seqz a0, a3
+; RV32-NEXT:    seqz a0, a4
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    andi a0, a0, 4
-; RV32-NEXT:    seqz a3, a4
+; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    andi a3, a3, 2
 ; RV32-NEXT:    bltu a3, a0, .LBB0_2
@@ -40,14 +40,14 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ;
 ; RV64-LABEL: ctz_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a3, 0(a0)
+; RV64-NEXT:    lw a3, 16(a0)
+; RV64-NEXT:    lw a4, 0(a0)
 ; RV64-NEXT:    lw a1, 8(a0)
 ; RV64-NEXT:    lw a2, 24(a0)
-; RV64-NEXT:    lw a4, 16(a0)
-; RV64-NEXT:    seqz a0, a3
+; RV64-NEXT:    seqz a0, a4
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    andi a0, a0, 4
-; RV64-NEXT:    seqz a3, a4
+; RV64-NEXT:    seqz a3, a3
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    andi a3, a3, 2
 ; RV64-NEXT:    bltu a3, a0, .LBB0_2
diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
index 13d03c5217fb1bf..6d871dccbfcd67f 100644
--- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll
+++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll
@@ -56,15 +56,15 @@ entry:
 define void @test3(ptr %a, ptr %b) nounwind {
 ; RV32-LABEL: test3:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a2, 8(a1)
 ; RV32-NEXT:    lw a3, 12(a1)
-; RV32-NEXT:    lw a4, 8(a1)
+; RV32-NEXT:    lw a4, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lui a5, 524288
 ; RV32-NEXT:    xor a3, a3, a5
-; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a2, 8(a0)
 ; RV32-NEXT:    sw a1, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    sw a4, 4(a0)
 ; RV32-NEXT:    sw a3, 12(a0)
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index bfac15e009f00eb..0e5867800e9353e 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -222,8 +222,8 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset s1, -24
 ; RV64IFD-NEXT:    .cfi_offset s2, -32
 ; RV64IFD-NEXT:    .cfi_offset fs0, -40
-; RV64IFD-NEXT:    lhu s1, 16(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu s2, 16(a1)
 ; RV64IFD-NEXT:    lhu a1, 8(a1)
 ; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a1
@@ -231,23 +231,23 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fs0
-; RV64IFD-NEXT:    slli s2, a0, 16
+; RV64IFD-NEXT:    slli s1, a0, 16
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    slli a0, a0, 48
 ; RV64IFD-NEXT:    srli a0, a0, 48
-; RV64IFD-NEXT:    or s2, a0, s2
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    or s1, a0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.x.w a0, fa0
 ; RV64IFD-NEXT:    sh a0, 4(s0)
-; RV64IFD-NEXT:    sw s2, 0(s0)
+; RV64IFD-NEXT:    sw s1, 0(s0)
 ; RV64IFD-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64IFD-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -349,9 +349,9 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    .cfi_offset fs0, -48
 ; RV64IFD-NEXT:    .cfi_offset fs1, -56
 ; RV64IFD-NEXT:    .cfi_offset fs2, -64
-; RV64IFD-NEXT:    lhu s1, 24(a1)
-; RV64IFD-NEXT:    lhu s2, 0(a1)
-; RV64IFD-NEXT:    lhu s3, 8(a1)
+; RV64IFD-NEXT:    lhu s1, 0(a1)
+; RV64IFD-NEXT:    lhu s2, 8(a1)
+; RV64IFD-NEXT:    lhu s3, 24(a1)
 ; RV64IFD-NEXT:    lhu a1, 16(a1)
 ; RV64IFD-NEXT:    mv s0, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a1
@@ -359,17 +359,17 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs0, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s3
+; RV64IFD-NEXT:    fmv.w.x fa0, s2
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs1, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s2
+; RV64IFD-NEXT:    fmv.w.x fa0, s1
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    call __truncsfhf2@plt
 ; RV64IFD-NEXT:    fmv.s fs2, fa0
-; RV64IFD-NEXT:    fmv.w.x fa0, s1
+; RV64IFD-NEXT:    fmv.w.x fa0, s3
 ; RV64IFD-NEXT:    call __extendhfsf2@plt
 ; RV64IFD-NEXT:    call exp10f@plt
 ; RV64IFD-NEXT:    fmv.x.w s1, fs2
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index f2b7e8d26328d5b..33ab4fbaaf66e8e 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1167,28 +1167,28 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
-; RV32IM-NEXT:    lw a3, 8(a1)
-; RV32IM-NEXT:    lw a4, 0(a1)
+; RV32IM-NEXT:    lw a2, 0(a1)
+; RV32IM-NEXT:    lw a3, 12(a1)
+; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    li a5, -15
 ; RV32IM-NEXT:    slli a5, a5, 8
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    mul a7, a1, a5
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
 ; RV32IM-NEXT:    mulhu t0, a1, a5
 ; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a4
-; RV32IM-NEXT:    neg t0, a4
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    neg t0, a2
 ; RV32IM-NEXT:    sltu t1, a6, t0
 ; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a4, t2
+; RV32IM-NEXT:    mulhu t3, a2, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
 ; RV32IM-NEXT:    sub t4, t1, a1
-; RV32IM-NEXT:    mul t5, a3, a5
-; RV32IM-NEXT:    sub t5, t5, a4
+; RV32IM-NEXT:    mul t5, a4, a5
+; RV32IM-NEXT:    sub t5, t5, a2
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
 ; RV32IM-NEXT:    neg s1, a1
@@ -1198,17 +1198,17 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    mulhu t1, a3, a5
-; RV32IM-NEXT:    sub a3, t1, a3
-; RV32IM-NEXT:    add a2, a3, a2
-; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    mul a3, a3, a5
+; RV32IM-NEXT:    mulhu t1, a4, a5
+; RV32IM-NEXT:    sub a4, t1, a4
+; RV32IM-NEXT:    add a3, a4, a3
+; RV32IM-NEXT:    add a1, a2, a1
 ; RV32IM-NEXT:    sub a1, t3, a1
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    mul a2, a4, a5
+; RV32IM-NEXT:    mul a2, a2, a5
 ; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
 ; RV32IM-NEXT:    sw t6, 8(a0)
@@ -1292,17 +1292,17 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 12(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
-; RV32IM-NEXT:    lw a4, 4(a1)
+; RV32IM-NEXT:    lw a4, 12(a1)
 ; RV32IM-NEXT:    lw a1, 8(a1)
 ; RV32IM-NEXT:    li a5, -63
 ; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    slli a7, a4, 6
-; RV32IM-NEXT:    sub a7, a4, a7
+; RV32IM-NEXT:    slli a7, a2, 6
+; RV32IM-NEXT:    sub a7, a2, a7
 ; RV32IM-NEXT:    add a6, a7, a6
 ; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a4, a5
+; RV32IM-NEXT:    mulhu t0, a2, a5
 ; RV32IM-NEXT:    add a7, t0, a7
 ; RV32IM-NEXT:    sub a6, a6, a3
 ; RV32IM-NEXT:    neg t0, a3
@@ -1311,27 +1311,27 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    mulhu t3, a3, t2
 ; RV32IM-NEXT:    add t1, t3, t1
 ; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a4
+; RV32IM-NEXT:    sub t4, t1, a2
 ; RV32IM-NEXT:    slli t5, a1, 6
 ; RV32IM-NEXT:    sub t6, a1, a3
 ; RV32IM-NEXT:    sub t5, t6, t5
 ; RV32IM-NEXT:    add t6, t4, t5
 ; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a4
+; RV32IM-NEXT:    neg s1, a2
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a4, t2
+; RV32IM-NEXT:    mulhu t1, a2, t2
 ; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a2, 6
-; RV32IM-NEXT:    sub a2, a2, t1
+; RV32IM-NEXT:    slli t1, a4, 6
+; RV32IM-NEXT:    sub a4, a4, t1
 ; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    sub a5, a5, a1
-; RV32IM-NEXT:    add a2, a5, a2
-; RV32IM-NEXT:    add a4, a3, a4
-; RV32IM-NEXT:    sub a1, t3, a4
-; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a4, a5, a4
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    sub a1, t3, a2
+; RV32IM-NEXT:    add a1, a1, a4
 ; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 4c5c36fc72d14db..f2421970340b56a 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -907,54 +907,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -968,54 +968,54 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1029,44 +1029,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -1090,44 +1090,44 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -1163,112 +1163,112 @@ define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -2321,54 +2321,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -2382,54 +2382,54 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -2443,44 +2443,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
@@ -2504,44 +2504,44 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
@@ -2577,112 +2577,112 @@ define void @test_nontemporal_P1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_P1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.p1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.p1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.p1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.p1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -3735,54 +3735,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -3796,54 +3796,54 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -3857,44 +3857,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
@@ -3918,44 +3918,44 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
@@ -3991,112 +3991,112 @@ define void @test_nontemporal_PALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_PALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.pall
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.pall
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.pall
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.pall
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -5149,54 +5149,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -5210,54 +5210,54 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -5271,44 +5271,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
@@ -5332,44 +5332,44 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
@@ -5405,112 +5405,112 @@ define void @test_nontemporal_S1_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_S1_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.s1
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.s1
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.s1
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.s1
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
@@ -6563,54 +6563,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64-NEXT:    .cfi_offset s1, -16
-; CHECK-RV64-NEXT:    lbu a2, 0(a1)
-; CHECK-RV64-NEXT:    lbu a3, 8(a1)
-; CHECK-RV64-NEXT:    lbu a4, 16(a1)
-; CHECK-RV64-NEXT:    lbu a5, 24(a1)
-; CHECK-RV64-NEXT:    lbu a6, 32(a1)
-; CHECK-RV64-NEXT:    lbu a7, 40(a1)
-; CHECK-RV64-NEXT:    lbu t0, 48(a1)
-; CHECK-RV64-NEXT:    lbu t1, 56(a1)
-; CHECK-RV64-NEXT:    lbu t2, 64(a1)
-; CHECK-RV64-NEXT:    lbu t3, 72(a1)
-; CHECK-RV64-NEXT:    lbu t4, 80(a1)
-; CHECK-RV64-NEXT:    lbu t5, 88(a1)
-; CHECK-RV64-NEXT:    lbu t6, 120(a1)
-; CHECK-RV64-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64-NEXT:    lbu a2, 104(a1)
+; CHECK-RV64-NEXT:    lbu a3, 112(a1)
+; CHECK-RV64-NEXT:    lbu a4, 120(a1)
+; CHECK-RV64-NEXT:    lbu a5, 0(a1)
+; CHECK-RV64-NEXT:    lbu a6, 8(a1)
+; CHECK-RV64-NEXT:    lbu a7, 16(a1)
+; CHECK-RV64-NEXT:    lbu t0, 24(a1)
+; CHECK-RV64-NEXT:    lbu t1, 32(a1)
+; CHECK-RV64-NEXT:    lbu t2, 40(a1)
+; CHECK-RV64-NEXT:    lbu t3, 48(a1)
+; CHECK-RV64-NEXT:    lbu t4, 56(a1)
+; CHECK-RV64-NEXT:    lbu t5, 64(a1)
+; CHECK-RV64-NEXT:    lbu t6, 72(a1)
+; CHECK-RV64-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t6, 15(a0)
+; CHECK-RV64-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s0, 14(a0)
+; CHECK-RV64-NEXT:    sb a3, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb s1, 13(a0)
+; CHECK-RV64-NEXT:    sb a2, 13(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t5, 11(a0)
+; CHECK-RV64-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t4, 10(a0)
+; CHECK-RV64-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t3, 9(a0)
+; CHECK-RV64-NEXT:    sb t6, 9(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t2, 8(a0)
+; CHECK-RV64-NEXT:    sb t5, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t1, 7(a0)
+; CHECK-RV64-NEXT:    sb t4, 7(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb t0, 6(a0)
+; CHECK-RV64-NEXT:    sb t3, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a7, 5(a0)
+; CHECK-RV64-NEXT:    sb t2, 5(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a6, 4(a0)
+; CHECK-RV64-NEXT:    sb t1, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a5, 3(a0)
+; CHECK-RV64-NEXT:    sb t0, 3(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a4, 2(a0)
+; CHECK-RV64-NEXT:    sb a7, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a3, 1(a0)
+; CHECK-RV64-NEXT:    sb a6, 1(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sb a2, 0(a0)
+; CHECK-RV64-NEXT:    sb a5, 0(a0)
 ; CHECK-RV64-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -6624,54 +6624,54 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32-NEXT:    .cfi_offset s1, -8
-; CHECK-RV32-NEXT:    lbu a2, 0(a1)
-; CHECK-RV32-NEXT:    lbu a3, 4(a1)
-; CHECK-RV32-NEXT:    lbu a4, 8(a1)
-; CHECK-RV32-NEXT:    lbu a5, 12(a1)
-; CHECK-RV32-NEXT:    lbu a6, 16(a1)
-; CHECK-RV32-NEXT:    lbu a7, 20(a1)
-; CHECK-RV32-NEXT:    lbu t0, 24(a1)
-; CHECK-RV32-NEXT:    lbu t1, 28(a1)
-; CHECK-RV32-NEXT:    lbu t2, 32(a1)
-; CHECK-RV32-NEXT:    lbu t3, 36(a1)
-; CHECK-RV32-NEXT:    lbu t4, 40(a1)
-; CHECK-RV32-NEXT:    lbu t5, 44(a1)
-; CHECK-RV32-NEXT:    lbu t6, 60(a1)
-; CHECK-RV32-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32-NEXT:    lbu a2, 52(a1)
+; CHECK-RV32-NEXT:    lbu a3, 56(a1)
+; CHECK-RV32-NEXT:    lbu a4, 60(a1)
+; CHECK-RV32-NEXT:    lbu a5, 0(a1)
+; CHECK-RV32-NEXT:    lbu a6, 4(a1)
+; CHECK-RV32-NEXT:    lbu a7, 8(a1)
+; CHECK-RV32-NEXT:    lbu t0, 12(a1)
+; CHECK-RV32-NEXT:    lbu t1, 16(a1)
+; CHECK-RV32-NEXT:    lbu t2, 20(a1)
+; CHECK-RV32-NEXT:    lbu t3, 24(a1)
+; CHECK-RV32-NEXT:    lbu t4, 28(a1)
+; CHECK-RV32-NEXT:    lbu t5, 32(a1)
+; CHECK-RV32-NEXT:    lbu t6, 36(a1)
+; CHECK-RV32-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t6, 15(a0)
+; CHECK-RV32-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s0, 14(a0)
+; CHECK-RV32-NEXT:    sb a3, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb s1, 13(a0)
+; CHECK-RV32-NEXT:    sb a2, 13(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t5, 11(a0)
+; CHECK-RV32-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t4, 10(a0)
+; CHECK-RV32-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t3, 9(a0)
+; CHECK-RV32-NEXT:    sb t6, 9(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t2, 8(a0)
+; CHECK-RV32-NEXT:    sb t5, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t1, 7(a0)
+; CHECK-RV32-NEXT:    sb t4, 7(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb t0, 6(a0)
+; CHECK-RV32-NEXT:    sb t3, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a7, 5(a0)
+; CHECK-RV32-NEXT:    sb t2, 5(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a6, 4(a0)
+; CHECK-RV32-NEXT:    sb t1, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a5, 3(a0)
+; CHECK-RV32-NEXT:    sb t0, 3(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a4, 2(a0)
+; CHECK-RV32-NEXT:    sb a7, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a3, 1(a0)
+; CHECK-RV32-NEXT:    sb a6, 1(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sb a2, 0(a0)
+; CHECK-RV32-NEXT:    sb a5, 0(a0)
 ; CHECK-RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -6685,44 +6685,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV64C-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
 ; CHECK-RV64C-NEXT:    .cfi_offset s0, -8
 ; CHECK-RV64C-NEXT:    .cfi_offset s1, -16
+; CHECK-RV64C-NEXT:    lbu t3, 104(a1)
+; CHECK-RV64C-NEXT:    lbu t6, 112(a1)
+; CHECK-RV64C-NEXT:    lbu a4, 120(a1)
 ; CHECK-RV64C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV64C-NEXT:    lbu a7, 8(a1)
 ; CHECK-RV64C-NEXT:    lbu t0, 16(a1)
 ; CHECK-RV64C-NEXT:    lbu t1, 24(a1)
 ; CHECK-RV64C-NEXT:    lbu t2, 32(a1)
-; CHECK-RV64C-NEXT:    lbu t3, 40(a1)
-; CHECK-RV64C-NEXT:    lbu t4, 48(a1)
-; CHECK-RV64C-NEXT:    lbu t5, 56(a1)
-; CHECK-RV64C-NEXT:    lbu t6, 64(a1)
+; CHECK-RV64C-NEXT:    lbu t4, 40(a1)
+; CHECK-RV64C-NEXT:    lbu t5, 48(a1)
+; CHECK-RV64C-NEXT:    lbu a5, 56(a1)
+; CHECK-RV64C-NEXT:    lbu a2, 64(a1)
 ; CHECK-RV64C-NEXT:    lbu a3, 72(a1)
-; CHECK-RV64C-NEXT:    lbu a4, 80(a1)
-; CHECK-RV64C-NEXT:    lbu a5, 88(a1)
-; CHECK-RV64C-NEXT:    lbu a2, 120(a1)
-; CHECK-RV64C-NEXT:    lbu s0, 112(a1)
-; CHECK-RV64C-NEXT:    lbu s1, 104(a1)
+; CHECK-RV64C-NEXT:    lbu s0, 80(a1)
+; CHECK-RV64C-NEXT:    lbu s1, 88(a1)
 ; CHECK-RV64C-NEXT:    lbu a1, 96(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a2, 15(a0)
+; CHECK-RV64C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s0, 14(a0)
+; CHECK-RV64C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb s1, 13(a0)
+; CHECK-RV64C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a5, 11(a0)
+; CHECK-RV64C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb a4, 10(a0)
+; CHECK-RV64C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t6, 8(a0)
+; CHECK-RV64C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t5, 7(a0)
+; CHECK-RV64C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t4, 6(a0)
+; CHECK-RV64C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sb t3, 5(a0)
+; CHECK-RV64C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
@@ -6746,44 +6746,44 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 ; CHECK-RV32C-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32C-NEXT:    .cfi_offset s0, -4
 ; CHECK-RV32C-NEXT:    .cfi_offset s1, -8
+; CHECK-RV32C-NEXT:    lbu t3, 52(a1)
+; CHECK-RV32C-NEXT:    lbu t6, 56(a1)
+; CHECK-RV32C-NEXT:    lbu a4, 60(a1)
 ; CHECK-RV32C-NEXT:    lbu a6, 0(a1)
 ; CHECK-RV32C-NEXT:    lbu a7, 4(a1)
 ; CHECK-RV32C-NEXT:    lbu t0, 8(a1)
 ; CHECK-RV32C-NEXT:    lbu t1, 12(a1)
 ; CHECK-RV32C-NEXT:    lbu t2, 16(a1)
-; CHECK-RV32C-NEXT:    lbu t3, 20(a1)
-; CHECK-RV32C-NEXT:    lbu t4, 24(a1)
-; CHECK-RV32C-NEXT:    lbu t5, 28(a1)
-; CHECK-RV32C-NEXT:    lbu t6, 32(a1)
+; CHECK-RV32C-NEXT:    lbu t4, 20(a1)
+; CHECK-RV32C-NEXT:    lbu t5, 24(a1)
+; CHECK-RV32C-NEXT:    lbu a5, 28(a1)
+; CHECK-RV32C-NEXT:    lbu a2, 32(a1)
 ; CHECK-RV32C-NEXT:    lbu a3, 36(a1)
-; CHECK-RV32C-NEXT:    lbu a4, 40(a1)
-; CHECK-RV32C-NEXT:    lbu a5, 44(a1)
-; CHECK-RV32C-NEXT:    lbu a2, 60(a1)
-; CHECK-RV32C-NEXT:    lbu s0, 56(a1)
-; CHECK-RV32C-NEXT:    lbu s1, 52(a1)
+; CHECK-RV32C-NEXT:    lbu s0, 40(a1)
+; CHECK-RV32C-NEXT:    lbu s1, 44(a1)
 ; CHECK-RV32C-NEXT:    lbu a1, 48(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a2, 15(a0)
+; CHECK-RV32C-NEXT:    sb a4, 15(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s0, 14(a0)
+; CHECK-RV32C-NEXT:    sb t6, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb s1, 13(a0)
+; CHECK-RV32C-NEXT:    sb t3, 13(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a1, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a5, 11(a0)
+; CHECK-RV32C-NEXT:    sb s1, 11(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb a4, 10(a0)
+; CHECK-RV32C-NEXT:    sb s0, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb a3, 9(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t6, 8(a0)
+; CHECK-RV32C-NEXT:    sb a2, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t5, 7(a0)
+; CHECK-RV32C-NEXT:    sb a5, 7(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t4, 6(a0)
+; CHECK-RV32C-NEXT:    sb t5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sb t3, 5(a0)
+; CHECK-RV32C-NEXT:    sb t4, 5(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sb t2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
@@ -6819,112 +6819,112 @@ define void @test_nontemporal_ALL_store_v16i8(ptr %p, <16 x i8> %v) {
 define void @test_nontemporal_ALL_store_v8i16(ptr %p, <8 x i16> %v) {
 ; CHECK-RV64-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    lh a2, 0(a1)
-; CHECK-RV64-NEXT:    lh a3, 8(a1)
-; CHECK-RV64-NEXT:    lh a4, 16(a1)
-; CHECK-RV64-NEXT:    lh a5, 24(a1)
-; CHECK-RV64-NEXT:    lh a6, 56(a1)
-; CHECK-RV64-NEXT:    lh a7, 48(a1)
-; CHECK-RV64-NEXT:    lh t0, 40(a1)
+; CHECK-RV64-NEXT:    lh a2, 40(a1)
+; CHECK-RV64-NEXT:    lh a3, 48(a1)
+; CHECK-RV64-NEXT:    lh a4, 56(a1)
+; CHECK-RV64-NEXT:    lh a5, 0(a1)
+; CHECK-RV64-NEXT:    lh a6, 8(a1)
+; CHECK-RV64-NEXT:    lh a7, 16(a1)
+; CHECK-RV64-NEXT:    lh t0, 24(a1)
 ; CHECK-RV64-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a6, 14(a0)
+; CHECK-RV64-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a7, 12(a0)
+; CHECK-RV64-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh t0, 10(a0)
+; CHECK-RV64-NEXT:    sh a2, 10(a0)
 ; CHECK-RV64-NEXT:    ntl.all
 ; CHECK-RV64-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a5, 6(a0)
+; CHECK-RV64-NEXT:    sh t0, 6(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a4, 4(a0)
+; CHECK-RV64-NEXT:    sh a7, 4(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a3, 2(a0)
+; CHECK-RV64-NEXT:    sh a6, 2(a0)
 ; CHECK-RV64-NEXT:    ntl.all
-; CHECK-RV64-NEXT:    sh a2, 0(a0)
+; CHECK-RV64-NEXT:    sh a5, 0(a0)
 ; CHECK-RV64-NEXT:    ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    lh a2, 0(a1)
-; CHECK-RV32-NEXT:    lh a3, 4(a1)
-; CHECK-RV32-NEXT:    lh a4, 8(a1)
-; CHECK-RV32-NEXT:    lh a5, 12(a1)
-; CHECK-RV32-NEXT:    lh a6, 28(a1)
-; CHECK-RV32-NEXT:    lh a7, 24(a1)
-; CHECK-RV32-NEXT:    lh t0, 20(a1)
+; CHECK-RV32-NEXT:    lh a2, 20(a1)
+; CHECK-RV32-NEXT:    lh a3, 24(a1)
+; CHECK-RV32-NEXT:    lh a4, 28(a1)
+; CHECK-RV32-NEXT:    lh a5, 0(a1)
+; CHECK-RV32-NEXT:    lh a6, 4(a1)
+; CHECK-RV32-NEXT:    lh a7, 8(a1)
+; CHECK-RV32-NEXT:    lh t0, 12(a1)
 ; CHECK-RV32-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a6, 14(a0)
+; CHECK-RV32-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a7, 12(a0)
+; CHECK-RV32-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh t0, 10(a0)
+; CHECK-RV32-NEXT:    sh a2, 10(a0)
 ; CHECK-RV32-NEXT:    ntl.all
 ; CHECK-RV32-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a5, 6(a0)
+; CHECK-RV32-NEXT:    sh t0, 6(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a4, 4(a0)
+; CHECK-RV32-NEXT:    sh a7, 4(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a3, 2(a0)
+; CHECK-RV32-NEXT:    sh a6, 2(a0)
 ; CHECK-RV32-NEXT:    ntl.all
-; CHECK-RV32-NEXT:    sh a2, 0(a0)
+; CHECK-RV32-NEXT:    sh a5, 0(a0)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV64C:       # %bb.0:
+; CHECK-RV64C-NEXT:    lh a7, 40(a1)
+; CHECK-RV64C-NEXT:    lh a3, 48(a1)
+; CHECK-RV64C-NEXT:    lh a4, 56(a1)
 ; CHECK-RV64C-NEXT:    lh a6, 0(a1)
-; CHECK-RV64C-NEXT:    lh a7, 8(a1)
-; CHECK-RV64C-NEXT:    lh t0, 16(a1)
+; CHECK-RV64C-NEXT:    lh t0, 8(a1)
+; CHECK-RV64C-NEXT:    lh a2, 16(a1)
 ; CHECK-RV64C-NEXT:    lh a5, 24(a1)
-; CHECK-RV64C-NEXT:    lh a2, 56(a1)
-; CHECK-RV64C-NEXT:    lh a3, 48(a1)
-; CHECK-RV64C-NEXT:    lh a4, 40(a1)
 ; CHECK-RV64C-NEXT:    lh a1, 32(a1)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a2, 14(a0)
+; CHECK-RV64C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a4, 10(a0)
+; CHECK-RV64C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh t0, 4(a0)
+; CHECK-RV64C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
-; CHECK-RV64C-NEXT:    sh a7, 2(a0)
+; CHECK-RV64C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV64C-NEXT:    c.ntl.all
 ; CHECK-RV64C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV64C-NEXT:    ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_store_v8i16:
 ; CHECK-RV32C:       # %bb.0:
+; CHECK-RV32C-NEXT:    lh a7, 20(a1)
+; CHECK-RV32C-NEXT:    lh a3, 24(a1)
+; CHECK-RV32C-NEXT:    lh a4, 28(a1)
 ; CHECK-RV32C-NEXT:    lh a6, 0(a1)
-; CHECK-RV32C-NEXT:    lh a7, 4(a1)
-; CHECK-RV32C-NEXT:    lh t0, 8(a1)
+; CHECK-RV32C-NEXT:    lh t0, 4(a1)
+; CHECK-RV32C-NEXT:    lh a2, 8(a1)
 ; CHECK-RV32C-NEXT:    lh a5, 12(a1)
-; CHECK-RV32C-NEXT:    lh a2, 28(a1)
-; CHECK-RV32C-NEXT:    lh a3, 24(a1)
-; CHECK-RV32C-NEXT:    lh a4, 20(a1)
 ; CHECK-RV32C-NEXT:    lh a1, 16(a1)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a2, 14(a0)
+; CHECK-RV32C-NEXT:    sh a4, 14(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a3, 12(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a4, 10(a0)
+; CHECK-RV32C-NEXT:    sh a7, 10(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a1, 8(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a5, 6(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh t0, 4(a0)
+; CHECK-RV32C-NEXT:    sh a2, 4(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
-; CHECK-RV32C-NEXT:    sh a7, 2(a0)
+; CHECK-RV32C-NEXT:    sh t0, 2(a0)
 ; CHECK-RV32C-NEXT:    c.ntl.all
 ; CHECK-RV32C-NEXT:    sh a6, 0(a0)
 ; CHECK-RV32C-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
index 776944b177636c2..4521b07fd862c6a 100644
--- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll
+++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll
@@ -1133,41 +1133,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1176,41 +1176,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1219,41 +1219,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -32
 ; RV32IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV32IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV32IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV32IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV32IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV32IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV32IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV32IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV32IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 32
 ;
@@ -1262,41 +1262,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s4}, -48
 ; RV64IZCMP-SR-NEXT:    lui a0, %hi(var0)
 ; RV64IZCMP-SR-NEXT:    lw a6, %lo(var0)(a0)
-; RV64IZCMP-SR-NEXT:    lw a7, %lo(var0+4)(a0)
-; RV64IZCMP-SR-NEXT:    lw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    lw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    addi a5, a0, %lo(var0)
-; RV64IZCMP-SR-NEXT:    lw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 20(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 16(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, %lo(var0+12)(a0)
-; RV64IZCMP-SR-NEXT:    sw t0, %lo(var0+8)(a0)
-; RV64IZCMP-SR-NEXT:    sw a7, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    addi a2, a0, %lo(var0)
+; RV64IZCMP-SR-NEXT:    lw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    lw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    lw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64IZCMP-SR-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s1, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw a5, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    sw t3, 32(a2)
+; RV64IZCMP-SR-NEXT:    sw t2, 28(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, 24(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 20(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64IZCMP-SR-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64IZCMP-SR-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64IZCMP-SR-NEXT:    sw a6, %lo(var0)(a0)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s4}, 48
 ;
@@ -1310,41 +1310,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV32I-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(var0)
 ; RV32I-NEXT:    lw a1, %lo(var0)(a0)
-; RV32I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV32I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    addi a5, a0, %lo(var0)
-; RV32I-NEXT:    lw a6, 16(a5)
-; RV32I-NEXT:    lw a7, 20(a5)
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 68(a5)
-; RV32I-NEXT:    lw s2, 64(a5)
-; RV32I-NEXT:    lw s3, 60(a5)
-; RV32I-NEXT:    lw s4, 56(a5)
-; RV32I-NEXT:    sw s1, 68(a5)
-; RV32I-NEXT:    sw s2, 64(a5)
-; RV32I-NEXT:    sw s3, 60(a5)
-; RV32I-NEXT:    sw s4, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
-; RV32I-NEXT:    sw a7, 20(a5)
-; RV32I-NEXT:    sw a6, 16(a5)
-; RV32I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV32I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV32I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV32I-NEXT:    addi a2, a0, %lo(var0)
+; RV32I-NEXT:    lw a3, 16(a2)
+; RV32I-NEXT:    lw a4, 20(a2)
+; RV32I-NEXT:    lw a5, 24(a2)
+; RV32I-NEXT:    lw a6, 28(a2)
+; RV32I-NEXT:    lw a7, 32(a2)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV32I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
+; RV32I-NEXT:    sw a7, 32(a2)
+; RV32I-NEXT:    sw a6, 28(a2)
+; RV32I-NEXT:    sw a5, 24(a2)
+; RV32I-NEXT:    sw a4, 20(a2)
+; RV32I-NEXT:    sw a3, 16(a2)
+; RV32I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV32I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV32I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV32I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV32I-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
@@ -1364,41 +1364,41 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) nounwind {
 ; RV64I-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lui a0, %hi(var0)
 ; RV64I-NEXT:    lw a1, %lo(var0)(a0)
-; RV64I-NEXT:    lw a2, %lo(var0+4)(a0)
-; RV64I-NEXT:    lw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    lw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    addi a5, a0, %lo(var0)
-; RV64I-NEXT:    lw a6, 16(a5)
-; RV64I-NEXT:    lw a7, 20(a5)
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 68(a5)
-; RV64I-NEXT:    lw s2, 64(a5)
-; RV64I-NEXT:    lw s3, 60(a5)
-; RV64I-NEXT:    lw s4, 56(a5)
-; RV64I-NEXT:    sw s1, 68(a5)
-; RV64I-NEXT:    sw s2, 64(a5)
-; RV64I-NEXT:    sw s3, 60(a5)
-; RV64I-NEXT:    sw s4, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
-; RV64I-NEXT:    sw a7, 20(a5)
-; RV64I-NEXT:    sw a6, 16(a5)
-; RV64I-NEXT:    sw a4, %lo(var0+12)(a0)
-; RV64I-NEXT:    sw a3, %lo(var0+8)(a0)
-; RV64I-NEXT:    sw a2, %lo(var0+4)(a0)
+; RV64I-NEXT:    addi a2, a0, %lo(var0)
+; RV64I-NEXT:    lw a3, 16(a2)
+; RV64I-NEXT:    lw a4, 20(a2)
+; RV64I-NEXT:    lw a5, 24(a2)
+; RV64I-NEXT:    lw a6, 28(a2)
+; RV64I-NEXT:    lw a7, 32(a2)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, %lo(var0+4)(a0)
+; RV64I-NEXT:    lw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    lw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
+; RV64I-NEXT:    sw a7, 32(a2)
+; RV64I-NEXT:    sw a6, 28(a2)
+; RV64I-NEXT:    sw a5, 24(a2)
+; RV64I-NEXT:    sw a4, 20(a2)
+; RV64I-NEXT:    sw a3, 16(a2)
+; RV64I-NEXT:    sw s4, %lo(var0+12)(a0)
+; RV64I-NEXT:    sw s3, %lo(var0+8)(a0)
+; RV64I-NEXT:    sw s2, %lo(var0+4)(a0)
 ; RV64I-NEXT:    sw a1, %lo(var0)(a0)
 ; RV64I-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
@@ -1837,84 +1837,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -1953,84 +1953,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2069,84 +2069,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32IZCMP-SR-NEXT:    sw t4, 44(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t5, 40(sp) # 4-byte Folded Spill
 ; RV32IZCMP-SR-NEXT:    sw t6, 36(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 32(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 32(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    lw t0, 92(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t1, 88(sp) # 4-byte Folded Reload
 ; RV32IZCMP-SR-NEXT:    lw t2, 84(sp) # 4-byte Folded Reload
@@ -2185,84 +2185,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64IZCMP-SR-NEXT:    sd t4, 72(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t5, 64(sp) # 8-byte Folded Spill
 ; RV64IZCMP-SR-NEXT:    sd t6, 56(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    ld t0, 168(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t1, 160(sp) # 8-byte Folded Reload
 ; RV64IZCMP-SR-NEXT:    ld t2, 152(sp) # 8-byte Folded Reload
@@ -2313,84 +2313,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV32I-NEXT:    sw t4, 40(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t5, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw t6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw t1, 132(sp) # 4-byte Folded Reload
@@ -2453,84 +2453,84 @@ define void @callee_with_irq() nounwind "interrupt"="user" {
 ; RV64I-NEXT:    sd t4, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t5, 56(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd t6, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 264(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t0, 256(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld t1, 248(sp) # 8-byte Folded Reload
@@ -2570,333 +2570,333 @@ define void @callee_no_irq() nounwind{
 ; RV32IZCMP-LABEL: callee_no_irq:
 ; RV32IZCMP:       # %bb.0:
 ; RV32IZCMP-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-LABEL: callee_no_irq:
 ; RV64IZCMP:       # %bb.0:
 ; RV64IZCMP-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32IZCMP-SR-LABEL: callee_no_irq:
 ; RV32IZCMP-SR:       # %bb.0:
 ; RV32IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -96
-; RV32IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV32IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV32IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV32IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV32IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV32IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV32IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV32IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV32IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV32IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV32IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV32IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV32IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV32IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV32IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV32IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV32IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV32IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV32IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV32IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV32IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV32IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV32IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV32IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV32IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV32IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV32IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV32IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV32IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV32IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV32IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV32IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV32IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV32IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV32IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV32IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV32IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV32IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV32IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV32IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV32IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV32IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV32IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV32IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV32IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV32IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV32IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV32IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV32IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV32IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV32IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV32IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV32IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV32IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV32IZCMP-SR-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV32IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV32IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV32IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV32IZCMP-SR-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
-; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV32IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 96
 ;
 ; RV64IZCMP-SR-LABEL: callee_no_irq:
 ; RV64IZCMP-SR:       # %bb.0:
 ; RV64IZCMP-SR-NEXT:    cm.push {ra, s0-s11}, -160
-; RV64IZCMP-SR-NEXT:    lui a6, %hi(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    lui a5, %hi(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    addi a2, a5, %lo(var_test_irq)
+; RV64IZCMP-SR-NEXT:    lw a0, 16(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    lw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64IZCMP-SR-NEXT:    lw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    lw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    sd a0, 0(sp) # 8-byte Folded Spill
-; RV64IZCMP-SR-NEXT:    lw t4, 24(a5)
-; RV64IZCMP-SR-NEXT:    lw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    lw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    lw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    lw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    lw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    lw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    lw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    lw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    lw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    lw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    lw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    lw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    lw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    lw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    lw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    lw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    lw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    lw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    lw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    lw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    lw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    lw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    lw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    lw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    lw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a0, 124(a5)
-; RV64IZCMP-SR-NEXT:    sw a1, 120(a5)
-; RV64IZCMP-SR-NEXT:    sw a2, 116(a5)
-; RV64IZCMP-SR-NEXT:    sw a3, 112(a5)
-; RV64IZCMP-SR-NEXT:    sw a4, 108(a5)
-; RV64IZCMP-SR-NEXT:    sw a7, 104(a5)
-; RV64IZCMP-SR-NEXT:    sw s0, 100(a5)
-; RV64IZCMP-SR-NEXT:    sw t0, 96(a5)
-; RV64IZCMP-SR-NEXT:    sw t1, 92(a5)
-; RV64IZCMP-SR-NEXT:    sw t2, 88(a5)
-; RV64IZCMP-SR-NEXT:    sw t3, 84(a5)
-; RV64IZCMP-SR-NEXT:    sw s1, 80(a5)
-; RV64IZCMP-SR-NEXT:    sw ra, 76(a5)
-; RV64IZCMP-SR-NEXT:    sw s11, 72(a5)
-; RV64IZCMP-SR-NEXT:    sw s10, 68(a5)
-; RV64IZCMP-SR-NEXT:    sw s9, 64(a5)
-; RV64IZCMP-SR-NEXT:    sw s8, 60(a5)
-; RV64IZCMP-SR-NEXT:    sw s7, 56(a5)
-; RV64IZCMP-SR-NEXT:    sw s6, 52(a5)
-; RV64IZCMP-SR-NEXT:    sw s5, 48(a5)
-; RV64IZCMP-SR-NEXT:    sw s4, 44(a5)
-; RV64IZCMP-SR-NEXT:    sw s3, 40(a5)
-; RV64IZCMP-SR-NEXT:    sw s2, 36(a5)
-; RV64IZCMP-SR-NEXT:    sw t6, 32(a5)
-; RV64IZCMP-SR-NEXT:    sw t5, 28(a5)
-; RV64IZCMP-SR-NEXT:    sw t4, 24(a5)
+; RV64IZCMP-SR-NEXT:    lw t4, 36(a2)
+; RV64IZCMP-SR-NEXT:    lw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    lw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    lw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    lw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    lw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    lw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    lw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    lw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    lw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    lw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    lw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    lw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    lw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    lw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    lw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    lw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    lw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    lw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    lw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    lw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    lw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    lw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    lw t3, %lo(var_test_irq+4)(a5)
+; RV64IZCMP-SR-NEXT:    lw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    lw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 124(a2)
+; RV64IZCMP-SR-NEXT:    sw a1, 120(a2)
+; RV64IZCMP-SR-NEXT:    sw a3, 116(a2)
+; RV64IZCMP-SR-NEXT:    sw s0, 112(a2)
+; RV64IZCMP-SR-NEXT:    sw a4, 108(a2)
+; RV64IZCMP-SR-NEXT:    sw a6, 104(a2)
+; RV64IZCMP-SR-NEXT:    sw a7, 100(a2)
+; RV64IZCMP-SR-NEXT:    sw t0, 96(a2)
+; RV64IZCMP-SR-NEXT:    sw s1, 92(a2)
+; RV64IZCMP-SR-NEXT:    sw ra, 88(a2)
+; RV64IZCMP-SR-NEXT:    sw s11, 84(a2)
+; RV64IZCMP-SR-NEXT:    sw s10, 80(a2)
+; RV64IZCMP-SR-NEXT:    sw s9, 76(a2)
+; RV64IZCMP-SR-NEXT:    sw s8, 72(a2)
+; RV64IZCMP-SR-NEXT:    sw s7, 68(a2)
+; RV64IZCMP-SR-NEXT:    sw s6, 64(a2)
+; RV64IZCMP-SR-NEXT:    sw s5, 60(a2)
+; RV64IZCMP-SR-NEXT:    sw s4, 56(a2)
+; RV64IZCMP-SR-NEXT:    sw s3, 52(a2)
+; RV64IZCMP-SR-NEXT:    sw s2, 48(a2)
+; RV64IZCMP-SR-NEXT:    sw t6, 44(a2)
+; RV64IZCMP-SR-NEXT:    sw t5, 40(a2)
+; RV64IZCMP-SR-NEXT:    sw t4, 36(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 20(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 32(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, 16(a5)
+; RV64IZCMP-SR-NEXT:    sw a0, 28(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 24(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 20(a2)
 ; RV64IZCMP-SR-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, 16(a2)
+; RV64IZCMP-SR-NEXT:    sw t1, %lo(var_test_irq+12)(a5)
+; RV64IZCMP-SR-NEXT:    sw t2, %lo(var_test_irq+8)(a5)
+; RV64IZCMP-SR-NEXT:    sw t3, %lo(var_test_irq+4)(a5)
 ; RV64IZCMP-SR-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64IZCMP-SR-NEXT:    sw a0, %lo(var_test_irq)(a5)
 ; RV64IZCMP-SR-NEXT:    cm.popret {ra, s0-s11}, 160
 ;
 ; RV32I-LABEL: callee_no_irq:
@@ -2915,84 +2915,84 @@ define void @callee_no_irq() nounwind{
 ; RV32I-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a6, %hi(var_test_irq)
-; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    lui a4, %hi(var_test_irq)
+; RV32I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV32I-NEXT:    lw a0, 16(a2)
 ; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    lw a0, 20(a2)
 ; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    lw a0, 24(a2)
 ; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV32I-NEXT:    lw a0, 16(a5)
+; RV32I-NEXT:    lw a0, 28(a2)
 ; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw a0, 20(a5)
+; RV32I-NEXT:    lw a0, 32(a2)
 ; RV32I-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw t0, 24(a5)
-; RV32I-NEXT:    lw t1, 28(a5)
-; RV32I-NEXT:    lw t2, 32(a5)
-; RV32I-NEXT:    lw t3, 36(a5)
-; RV32I-NEXT:    lw t4, 40(a5)
-; RV32I-NEXT:    lw t5, 44(a5)
-; RV32I-NEXT:    lw t6, 48(a5)
-; RV32I-NEXT:    lw s0, 52(a5)
-; RV32I-NEXT:    lw s1, 56(a5)
-; RV32I-NEXT:    lw s2, 60(a5)
-; RV32I-NEXT:    lw s3, 64(a5)
-; RV32I-NEXT:    lw s4, 68(a5)
-; RV32I-NEXT:    lw s5, 72(a5)
-; RV32I-NEXT:    lw s6, 76(a5)
-; RV32I-NEXT:    lw s7, 80(a5)
-; RV32I-NEXT:    lw s8, 84(a5)
-; RV32I-NEXT:    lw s9, 88(a5)
-; RV32I-NEXT:    lw s10, 92(a5)
-; RV32I-NEXT:    lw s11, 96(a5)
-; RV32I-NEXT:    lw ra, 100(a5)
-; RV32I-NEXT:    lw a7, 104(a5)
-; RV32I-NEXT:    lw a4, 108(a5)
-; RV32I-NEXT:    lw a0, 124(a5)
-; RV32I-NEXT:    lw a1, 120(a5)
-; RV32I-NEXT:    lw a2, 116(a5)
-; RV32I-NEXT:    lw a3, 112(a5)
-; RV32I-NEXT:    sw a0, 124(a5)
-; RV32I-NEXT:    sw a1, 120(a5)
-; RV32I-NEXT:    sw a2, 116(a5)
-; RV32I-NEXT:    sw a3, 112(a5)
-; RV32I-NEXT:    sw a4, 108(a5)
-; RV32I-NEXT:    sw a7, 104(a5)
-; RV32I-NEXT:    sw ra, 100(a5)
-; RV32I-NEXT:    sw s11, 96(a5)
-; RV32I-NEXT:    sw s10, 92(a5)
-; RV32I-NEXT:    sw s9, 88(a5)
-; RV32I-NEXT:    sw s8, 84(a5)
-; RV32I-NEXT:    sw s7, 80(a5)
-; RV32I-NEXT:    sw s6, 76(a5)
-; RV32I-NEXT:    sw s5, 72(a5)
-; RV32I-NEXT:    sw s4, 68(a5)
-; RV32I-NEXT:    sw s3, 64(a5)
-; RV32I-NEXT:    sw s2, 60(a5)
-; RV32I-NEXT:    sw s1, 56(a5)
-; RV32I-NEXT:    sw s0, 52(a5)
-; RV32I-NEXT:    sw t6, 48(a5)
-; RV32I-NEXT:    sw t5, 44(a5)
-; RV32I-NEXT:    sw t4, 40(a5)
-; RV32I-NEXT:    sw t3, 36(a5)
-; RV32I-NEXT:    sw t2, 32(a5)
-; RV32I-NEXT:    sw t1, 28(a5)
-; RV32I-NEXT:    sw t0, 24(a5)
+; RV32I-NEXT:    lw t0, 36(a2)
+; RV32I-NEXT:    lw t1, 40(a2)
+; RV32I-NEXT:    lw t2, 44(a2)
+; RV32I-NEXT:    lw t3, 48(a2)
+; RV32I-NEXT:    lw t4, 52(a2)
+; RV32I-NEXT:    lw t5, 56(a2)
+; RV32I-NEXT:    lw t6, 60(a2)
+; RV32I-NEXT:    lw s0, 64(a2)
+; RV32I-NEXT:    lw s1, 68(a2)
+; RV32I-NEXT:    lw s2, 72(a2)
+; RV32I-NEXT:    lw s3, 76(a2)
+; RV32I-NEXT:    lw s4, 80(a2)
+; RV32I-NEXT:    lw s5, 84(a2)
+; RV32I-NEXT:    lw s6, 88(a2)
+; RV32I-NEXT:    lw s7, 92(a2)
+; RV32I-NEXT:    lw s8, 96(a2)
+; RV32I-NEXT:    lw s9, 100(a2)
+; RV32I-NEXT:    lw s10, 104(a2)
+; RV32I-NEXT:    lw s11, 108(a2)
+; RV32I-NEXT:    lw ra, 112(a2)
+; RV32I-NEXT:    lw a3, 116(a2)
+; RV32I-NEXT:    lw a1, 120(a2)
+; RV32I-NEXT:    lw a0, 124(a2)
+; RV32I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV32I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a0, 124(a2)
+; RV32I-NEXT:    sw a1, 120(a2)
+; RV32I-NEXT:    sw a3, 116(a2)
+; RV32I-NEXT:    sw ra, 112(a2)
+; RV32I-NEXT:    sw s11, 108(a2)
+; RV32I-NEXT:    sw s10, 104(a2)
+; RV32I-NEXT:    sw s9, 100(a2)
+; RV32I-NEXT:    sw s8, 96(a2)
+; RV32I-NEXT:    sw s7, 92(a2)
+; RV32I-NEXT:    sw s6, 88(a2)
+; RV32I-NEXT:    sw s5, 84(a2)
+; RV32I-NEXT:    sw s4, 80(a2)
+; RV32I-NEXT:    sw s3, 76(a2)
+; RV32I-NEXT:    sw s2, 72(a2)
+; RV32I-NEXT:    sw s1, 68(a2)
+; RV32I-NEXT:    sw s0, 64(a2)
+; RV32I-NEXT:    sw t6, 60(a2)
+; RV32I-NEXT:    sw t5, 56(a2)
+; RV32I-NEXT:    sw t4, 52(a2)
+; RV32I-NEXT:    sw t3, 48(a2)
+; RV32I-NEXT:    sw t2, 44(a2)
+; RV32I-NEXT:    sw t1, 40(a2)
+; RV32I-NEXT:    sw t0, 36(a2)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 20(a5)
+; RV32I-NEXT:    sw a0, 32(a2)
 ; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, 16(a5)
+; RV32I-NEXT:    sw a0, 28(a2)
 ; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV32I-NEXT:    sw a0, 24(a2)
 ; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV32I-NEXT:    sw a0, 20(a2)
 ; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV32I-NEXT:    sw a0, 16(a2)
+; RV32I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV32I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV32I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV32I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
@@ -3025,84 +3025,84 @@ define void @callee_no_irq() nounwind{
 ; RV64I-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a6, %hi(var_test_irq)
-; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    lui a4, %hi(var_test_irq)
+; RV64I-NEXT:    lw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    addi a2, a4, %lo(var_test_irq)
+; RV64I-NEXT:    lw a0, 16(a2)
 ; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    lw a0, 20(a2)
 ; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    lw a0, 24(a2)
 ; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a5, a6, %lo(var_test_irq)
-; RV64I-NEXT:    lw a0, 16(a5)
+; RV64I-NEXT:    lw a0, 28(a2)
 ; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw a0, 20(a5)
+; RV64I-NEXT:    lw a0, 32(a2)
 ; RV64I-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lw t0, 24(a5)
-; RV64I-NEXT:    lw t1, 28(a5)
-; RV64I-NEXT:    lw t2, 32(a5)
-; RV64I-NEXT:    lw t3, 36(a5)
-; RV64I-NEXT:    lw t4, 40(a5)
-; RV64I-NEXT:    lw t5, 44(a5)
-; RV64I-NEXT:    lw t6, 48(a5)
-; RV64I-NEXT:    lw s0, 52(a5)
-; RV64I-NEXT:    lw s1, 56(a5)
-; RV64I-NEXT:    lw s2, 60(a5)
-; RV64I-NEXT:    lw s3, 64(a5)
-; RV64I-NEXT:    lw s4, 68(a5)
-; RV64I-NEXT:    lw s5, 72(a5)
-; RV64I-NEXT:    lw s6, 76(a5)
-; RV64I-NEXT:    lw s7, 80(a5)
-; RV64I-NEXT:    lw s8, 84(a5)
-; RV64I-NEXT:    lw s9, 88(a5)
-; RV64I-NEXT:    lw s10, 92(a5)
-; RV64I-NEXT:    lw s11, 96(a5)
-; RV64I-NEXT:    lw ra, 100(a5)
-; RV64I-NEXT:    lw a7, 104(a5)
-; RV64I-NEXT:    lw a4, 108(a5)
-; RV64I-NEXT:    lw a0, 124(a5)
-; RV64I-NEXT:    lw a1, 120(a5)
-; RV64I-NEXT:    lw a2, 116(a5)
-; RV64I-NEXT:    lw a3, 112(a5)
-; RV64I-NEXT:    sw a0, 124(a5)
-; RV64I-NEXT:    sw a1, 120(a5)
-; RV64I-NEXT:    sw a2, 116(a5)
-; RV64I-NEXT:    sw a3, 112(a5)
-; RV64I-NEXT:    sw a4, 108(a5)
-; RV64I-NEXT:    sw a7, 104(a5)
-; RV64I-NEXT:    sw ra, 100(a5)
-; RV64I-NEXT:    sw s11, 96(a5)
-; RV64I-NEXT:    sw s10, 92(a5)
-; RV64I-NEXT:    sw s9, 88(a5)
-; RV64I-NEXT:    sw s8, 84(a5)
-; RV64I-NEXT:    sw s7, 80(a5)
-; RV64I-NEXT:    sw s6, 76(a5)
-; RV64I-NEXT:    sw s5, 72(a5)
-; RV64I-NEXT:    sw s4, 68(a5)
-; RV64I-NEXT:    sw s3, 64(a5)
-; RV64I-NEXT:    sw s2, 60(a5)
-; RV64I-NEXT:    sw s1, 56(a5)
-; RV64I-NEXT:    sw s0, 52(a5)
-; RV64I-NEXT:    sw t6, 48(a5)
-; RV64I-NEXT:    sw t5, 44(a5)
-; RV64I-NEXT:    sw t4, 40(a5)
-; RV64I-NEXT:    sw t3, 36(a5)
-; RV64I-NEXT:    sw t2, 32(a5)
-; RV64I-NEXT:    sw t1, 28(a5)
-; RV64I-NEXT:    sw t0, 24(a5)
+; RV64I-NEXT:    lw t0, 36(a2)
+; RV64I-NEXT:    lw t1, 40(a2)
+; RV64I-NEXT:    lw t2, 44(a2)
+; RV64I-NEXT:    lw t3, 48(a2)
+; RV64I-NEXT:    lw t4, 52(a2)
+; RV64I-NEXT:    lw t5, 56(a2)
+; RV64I-NEXT:    lw t6, 60(a2)
+; RV64I-NEXT:    lw s0, 64(a2)
+; RV64I-NEXT:    lw s1, 68(a2)
+; RV64I-NEXT:    lw s2, 72(a2)
+; RV64I-NEXT:    lw s3, 76(a2)
+; RV64I-NEXT:    lw s4, 80(a2)
+; RV64I-NEXT:    lw s5, 84(a2)
+; RV64I-NEXT:    lw s6, 88(a2)
+; RV64I-NEXT:    lw s7, 92(a2)
+; RV64I-NEXT:    lw s8, 96(a2)
+; RV64I-NEXT:    lw s9, 100(a2)
+; RV64I-NEXT:    lw s10, 104(a2)
+; RV64I-NEXT:    lw s11, 108(a2)
+; RV64I-NEXT:    lw ra, 112(a2)
+; RV64I-NEXT:    lw a3, 116(a2)
+; RV64I-NEXT:    lw a1, 120(a2)
+; RV64I-NEXT:    lw a0, 124(a2)
+; RV64I-NEXT:    lw a7, %lo(var_test_irq+4)(a4)
+; RV64I-NEXT:    lw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    lw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a0, 124(a2)
+; RV64I-NEXT:    sw a1, 120(a2)
+; RV64I-NEXT:    sw a3, 116(a2)
+; RV64I-NEXT:    sw ra, 112(a2)
+; RV64I-NEXT:    sw s11, 108(a2)
+; RV64I-NEXT:    sw s10, 104(a2)
+; RV64I-NEXT:    sw s9, 100(a2)
+; RV64I-NEXT:    sw s8, 96(a2)
+; RV64I-NEXT:    sw s7, 92(a2)
+; RV64I-NEXT:    sw s6, 88(a2)
+; RV64I-NEXT:    sw s5, 84(a2)
+; RV64I-NEXT:    sw s4, 80(a2)
+; RV64I-NEXT:    sw s3, 76(a2)
+; RV64I-NEXT:    sw s2, 72(a2)
+; RV64I-NEXT:    sw s1, 68(a2)
+; RV64I-NEXT:    sw s0, 64(a2)
+; RV64I-NEXT:    sw t6, 60(a2)
+; RV64I-NEXT:    sw t5, 56(a2)
+; RV64I-NEXT:    sw t4, 52(a2)
+; RV64I-NEXT:    sw t3, 48(a2)
+; RV64I-NEXT:    sw t2, 44(a2)
+; RV64I-NEXT:    sw t1, 40(a2)
+; RV64I-NEXT:    sw t0, 36(a2)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 20(a5)
+; RV64I-NEXT:    sw a0, 32(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, 16(a5)
+; RV64I-NEXT:    sw a0, 28(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+12)(a6)
+; RV64I-NEXT:    sw a0, 24(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+8)(a6)
+; RV64I-NEXT:    sw a0, 20(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq+4)(a6)
+; RV64I-NEXT:    sw a0, 16(a2)
+; RV64I-NEXT:    sw a5, %lo(var_test_irq+12)(a4)
+; RV64I-NEXT:    sw a6, %lo(var_test_irq+8)(a4)
+; RV64I-NEXT:    sw a7, %lo(var_test_irq+4)(a4)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a6)
+; RV64I-NEXT:    sw a0, %lo(var_test_irq)(a4)
 ; RV64I-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 136(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
index b2dea4237f5a5a0..6a605b2cc53aeaf 100644
--- a/llvm/test/CodeGen/RISCV/reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -8,24 +8,24 @@
 define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_sum_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_sum_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 24(a0)
-; RV64-NEXT:    lw a2, 8(a0)
-; RV64-NEXT:    lw a3, 0(a0)
+; RV64-NEXT:    lw a1, 8(a0)
+; RV64-NEXT:    lw a2, 0(a0)
+; RV64-NEXT:    lw a3, 24(a0)
 ; RV64-NEXT:    lw a0, 16(a0)
-; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    addw a0, a2, a0
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -40,24 +40,24 @@ define i32 @reduce_sum_4xi32(<4 x i32> %v) {
 define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_xor_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    xor a2, a3, a2
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    xor a1, a2, a1
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_xor_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a1, 8(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 24(a0)
 ; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    xor a2, a3, a2
-; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    xor a1, a2, a1
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
@@ -72,24 +72,24 @@ define i32 @reduce_xor_4xi32(<4 x i32> %v) {
 define i32 @reduce_or_4xi32(<4 x i32> %v) {
 ; RV32-LABEL: reduce_or_4xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 12(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a1, 4(a0)
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a0, 8(a0)
-; RV32-NEXT:    or a2, a3, a2
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    or a1, a2, a1
+; RV32-NEXT:    or a0, a0, a3
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: reduce_or_4xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 24(a0)
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a1, 8(a0)
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a3, 24(a0)
 ; RV64-NEXT:    ld a0, 16(a0)
-; RV64-NEXT:    or a2, a3, a2
-; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    or a1, a2, a1
+; RV64-NEXT:    or a0, a0, a3
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
   %e0 = extractelement <4 x i32> %v, i32 0
   %e1 = extractelement <4 x i32> %v, i32 1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 5f9ca503bcb053c..56fe3340c83e7c0 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -749,22 +749,22 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a0, 4(a1)
 ; RV32I-NEXT:    lw s2, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
 ; RV32I-NEXT:    lw s6, 0(a1)
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi s3, a2, 1365
-; RV32I-NEXT:    and a1, a1, s3
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi s4, a1, 819
-; RV32I-NEXT:    and a1, a0, s4
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, s4
-; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a0, a2, 1
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    addi s3, a1, 1365
+; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    sub a2, a2, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi s4, a0, 819
+; RV32I-NEXT:    and a0, a2, s4
+; RV32I-NEXT:    srli a2, a2, 2
+; RV32I-NEXT:    and a1, a2, s4
+; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    lui a1, 61681
@@ -835,20 +835,20 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 4(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a1, 12(a1)
-; RV32ZBB-NEXT:    cpop a2, a2
+; RV32ZBB-NEXT:    lw a2, 12(a1)
+; RV32ZBB-NEXT:    lw a3, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    cpop a4, a4
+; RV32ZBB-NEXT:    add a3, a4, a3
+; RV32ZBB-NEXT:    cpop a2, a2
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a3, a4
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    sw zero, 12(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   ret <2 x i64> %1
diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index f38aa71fb158d00..6c4466796aeeddf 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -177,12 +177,12 @@ define i8 @test13(ptr %0, i64 %1) {
 ; RV64I-NEXT:    li a2, 1
 ; RV64I-NEXT:    subw a2, a2, a1
 ; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    lbu a2, 0(a2)
 ; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    subw a3, a3, a1
 ; RV64I-NEXT:    add a0, a0, a3
+; RV64I-NEXT:    lbu a1, 0(a2)
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    ret
   %3 = mul i64 %1, -4294967296
   %4 = add i64 %3, 4294967296 ; 1 << 32
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index d34c10798f48219..f6d1d3882e5e8fd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -8,13 +8,13 @@ declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-LABEL: vpreduce_add_v4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a4, 4(a1)
-; RV32-NEXT:    lw a5, 12(a1)
-; RV32-NEXT:    lw a6, 8(a1)
+; RV32-NEXT:    lw a4, 12(a1)
+; RV32-NEXT:    lw a5, 8(a1)
+; RV32-NEXT:    lw a6, 4(a1)
 ; RV32-NEXT:    lw a1, 0(a1)
 ; RV32-NEXT:    lw a7, 0(a2)
-; RV32-NEXT:    lw t0, 8(a2)
-; RV32-NEXT:    lw t1, 12(a2)
+; RV32-NEXT:    lw t0, 12(a2)
+; RV32-NEXT:    lw t1, 8(a2)
 ; RV32-NEXT:    lw a2, 4(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
@@ -24,32 +24,32 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    sltiu a3, a3, 2
 ; RV32-NEXT:    xori a3, a3, 1
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    and a3, t4, t1
-; RV32-NEXT:    and t0, t3, t0
+; RV32-NEXT:    and a3, t4, t0
+; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
 ; RV32-NEXT:    and a1, a7, a1
 ; RV32-NEXT:    neg a7, t0
-; RV32-NEXT:    and a6, a7, a6
+; RV32-NEXT:    and a5, a7, a5
 ; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a3, a3, a5
+; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    and a2, a2, a4
+; RV32-NEXT:    and a2, a2, a6
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a1, a1, a6
+; RV32-NEXT:    add a1, a1, a5
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpreduce_add_v4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a4, 8(a1)
-; RV64-NEXT:    lw a5, 24(a1)
-; RV64-NEXT:    lw a6, 16(a1)
+; RV64-NEXT:    lw a4, 24(a1)
+; RV64-NEXT:    lw a5, 16(a1)
+; RV64-NEXT:    lw a6, 8(a1)
 ; RV64-NEXT:    lw a1, 0(a1)
 ; RV64-NEXT:    ld a7, 0(a2)
-; RV64-NEXT:    ld t0, 16(a2)
-; RV64-NEXT:    ld t1, 24(a2)
+; RV64-NEXT:    ld t0, 24(a2)
+; RV64-NEXT:    ld t1, 16(a2)
 ; RV64-NEXT:    ld a2, 8(a2)
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
@@ -60,19 +60,19 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    sltiu a3, a3, 2
 ; RV64-NEXT:    xori a3, a3, 1
 ; RV64-NEXT:    and a2, a3, a2
-; RV64-NEXT:    and a3, t4, t1
-; RV64-NEXT:    and t0, t3, t0
+; RV64-NEXT:    and a3, t4, t0
+; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
 ; RV64-NEXT:    and a1, a7, a1
 ; RV64-NEXT:    negw a7, t0
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    and a5, a7, a5
 ; RV64-NEXT:    negw a3, a3
-; RV64-NEXT:    and a3, a3, a5
+; RV64-NEXT:    and a3, a3, a4
 ; RV64-NEXT:    negw a2, a2
-; RV64-NEXT:    and a2, a2, a4
+; RV64-NEXT:    and a2, a2, a6
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a1, a1, a6
+; RV64-NEXT:    add a1, a1, a5
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 8ed19ddb1af5cf7..24a7655dc35a1c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -26,26 +26,26 @@ define void @add_v4i32(ptr %x, ptr %y) {
 define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v2i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 8(a0)
-; RV32-NEXT:    lw a3, 12(a0)
-; RV32-NEXT:    lw a4, 0(a0)
-; RV32-NEXT:    lw a5, 4(a0)
+; RV32-NEXT:    lw a2, 12(a0)
+; RV32-NEXT:    lw a3, 0(a0)
+; RV32-NEXT:    lw a4, 4(a0)
+; RV32-NEXT:    lw a5, 12(a1)
 ; RV32-NEXT:    lw a6, 4(a1)
 ; RV32-NEXT:    lw a7, 0(a1)
-; RV32-NEXT:    lw t0, 8(a1)
-; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a7, a4, a7
-; RV32-NEXT:    sltu a4, a7, a4
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add t0, a2, t0
-; RV32-NEXT:    sltu a2, t0, a2
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    sw t0, 8(a0)
+; RV32-NEXT:    lw t0, 8(a0)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    add a7, a3, a7
+; RV32-NEXT:    sltu a3, a7, a3
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    add a1, t0, a1
+; RV32-NEXT:    sltu a4, a1, t0
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    sw a1, 8(a0)
 ; RV32-NEXT:    sw a7, 0(a0)
-; RV32-NEXT:    sw a1, 12(a0)
-; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index 8acc70faaa1fc98..799e20074b04273 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -7,18 +7,18 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X:       # %bb.0:
-; ZVE32X-NEXT:    ld a1, 80(a0)
-; ZVE32X-NEXT:    ld a2, 72(a0)
-; ZVE32X-NEXT:    ld a3, 56(a0)
-; ZVE32X-NEXT:    ld a4, 32(a0)
-; ZVE32X-NEXT:    ld a5, 24(a0)
+; ZVE32X-NEXT:    ld a1, 32(a0)
+; ZVE32X-NEXT:    ld a2, 24(a0)
+; ZVE32X-NEXT:    ld a3, 80(a0)
+; ZVE32X-NEXT:    ld a4, 72(a0)
+; ZVE32X-NEXT:    ld a5, 56(a0)
 ; ZVE32X-NEXT:    ld a6, 48(a0)
 ; ZVE32X-NEXT:    ld a7, 8(a0)
 ; ZVE32X-NEXT:    ld a0, 0(a0)
-; ZVE32X-NEXT:    xor a4, a5, a4
-; ZVE32X-NEXT:    snez a4, a4
+; ZVE32X-NEXT:    xor a1, a2, a1
+; ZVE32X-NEXT:    snez a1, a1
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.s.x v8, a4
+; ZVE32X-NEXT:    vmv.s.x v8, a1
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vmv.s.x v8, zero
@@ -36,7 +36,7 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a0, a6, a3
+; ZVE32X-NEXT:    xor a0, a6, a5
 ; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v11, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
@@ -48,8 +48,8 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v0, v9, 0
 ; ZVE32X-NEXT:    vmerge.vim v9, v10, 1, v0
-; ZVE32X-NEXT:    xor a1, a2, a1
-; ZVE32X-NEXT:    snez a0, a1
+; ZVE32X-NEXT:    xor a3, a4, a3
+; ZVE32X-NEXT:    snez a0, a3
 ; ZVE32X-NEXT:    vmv.s.x v10, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v10, v10, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index d74fd6cd3f03470..221f9a005bc239f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -3966,22 +3966,22 @@ define <4 x i64> @mgather_falsemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru)
 ;
 ; RV32ZVE32F-LABEL: mgather_falsemask_v4i64:
 ; RV32ZVE32F:       # %bb.0:
-; RV32ZVE32F-NEXT:    lw a2, 0(a1)
-; RV32ZVE32F-NEXT:    lw a3, 4(a1)
-; RV32ZVE32F-NEXT:    lw a4, 8(a1)
-; RV32ZVE32F-NEXT:    lw a5, 12(a1)
-; RV32ZVE32F-NEXT:    lw a6, 28(a1)
-; RV32ZVE32F-NEXT:    lw a7, 24(a1)
-; RV32ZVE32F-NEXT:    lw t0, 20(a1)
+; RV32ZVE32F-NEXT:    lw a2, 20(a1)
+; RV32ZVE32F-NEXT:    lw a3, 24(a1)
+; RV32ZVE32F-NEXT:    lw a4, 28(a1)
+; RV32ZVE32F-NEXT:    lw a5, 0(a1)
+; RV32ZVE32F-NEXT:    lw a6, 4(a1)
+; RV32ZVE32F-NEXT:    lw a7, 8(a1)
+; RV32ZVE32F-NEXT:    lw t0, 12(a1)
 ; RV32ZVE32F-NEXT:    lw a1, 16(a1)
-; RV32ZVE32F-NEXT:    sw a6, 28(a0)
-; RV32ZVE32F-NEXT:    sw a7, 24(a0)
-; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a4, 28(a0)
+; RV32ZVE32F-NEXT:    sw a3, 24(a0)
+; RV32ZVE32F-NEXT:    sw a2, 20(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 16(a0)
-; RV32ZVE32F-NEXT:    sw a5, 12(a0)
-; RV32ZVE32F-NEXT:    sw a4, 8(a0)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw t0, 12(a0)
+; RV32ZVE32F-NEXT:    sw a7, 8(a0)
+; RV32ZVE32F-NEXT:    sw a6, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_falsemask_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 846295b3ead27dc..534f80a302229b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -753,18 +753,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    ld a6, 8(a1)
-; ZVE32F-NEXT:    ld a7, 0(a1)
-; ZVE32F-NEXT:    ld t0, 24(a1)
-; ZVE32F-NEXT:    ld t1, 16(a1)
+; ZVE32F-NEXT:    ld a6, 24(a1)
+; ZVE32F-NEXT:    ld a7, 16(a1)
+; ZVE32F-NEXT:    ld t0, 8(a1)
+; ZVE32F-NEXT:    ld t1, 0(a1)
 ; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    mul t3, a2, a5
 ; ZVE32F-NEXT:    add t3, a0, t3
-; ZVE32F-NEXT:    sd a7, 0(t3)
-; ZVE32F-NEXT:    sd a6, 0(t2)
-; ZVE32F-NEXT:    sd t1, 80(t3)
-; ZVE32F-NEXT:    sd t0, 80(t2)
+; ZVE32F-NEXT:    sd t1, 0(t3)
+; ZVE32F-NEXT:    sd t0, 0(t2)
+; ZVE32F-NEXT:    sd a7, 80(t3)
+; ZVE32F-NEXT:    sd a6, 80(t2)
 ; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a3, a3, 4
 ; ZVE32F-NEXT:    addi a4, a4, -4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 7497051027fa372..352e3a2df153987 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -364,21 +364,21 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2 at plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -459,9 +459,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
@@ -470,7 +470,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2 at plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -483,7 +483,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -499,7 +499,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -631,9 +631,9 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -642,7 +642,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -655,7 +655,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -671,7 +671,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -812,9 +812,9 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -823,7 +823,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -836,7 +836,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -852,7 +852,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -1267,37 +1267,37 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -1448,13 +1448,13 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -1463,7 +1463,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1472,7 +1472,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1481,7 +1481,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -1731,13 +1731,13 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -1746,7 +1746,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -1755,7 +1755,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -1764,7 +1764,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -2034,13 +2034,13 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -2049,7 +2049,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -2058,7 +2058,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -2067,7 +2067,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -3700,21 +3700,21 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -3795,9 +3795,9 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -3806,7 +3806,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -3819,7 +3819,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -3835,7 +3835,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -3965,9 +3965,9 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -3976,7 +3976,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -3989,7 +3989,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4005,7 +4005,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4058,21 +4058,21 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -48
 ; CHECK-NOV-NEXT:    .cfi_offset fs1, -56
 ; CHECK-NOV-NEXT:    .cfi_offset fs2, -64
-; CHECK-NOV-NEXT:    lhu s1, 24(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 24(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 16(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs2, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4145,9 +4145,9 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 24(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -4156,7 +4156,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
@@ -4169,7 +4169,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
@@ -4185,7 +4185,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
@@ -4588,37 +4588,37 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4769,13 +4769,13 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -4784,7 +4784,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -4793,7 +4793,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -4802,7 +4802,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5048,13 +5048,13 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -5063,7 +5063,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5072,7 +5072,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5081,7 +5081,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
@@ -5187,37 +5187,37 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset fs4, -112
 ; CHECK-NOV-NEXT:    .cfi_offset fs5, -120
 ; CHECK-NOV-NEXT:    .cfi_offset fs6, -128
-; CHECK-NOV-NEXT:    lhu s1, 56(a1)
-; CHECK-NOV-NEXT:    lhu s2, 0(a1)
-; CHECK-NOV-NEXT:    lhu s3, 8(a1)
-; CHECK-NOV-NEXT:    lhu s4, 16(a1)
-; CHECK-NOV-NEXT:    lhu s5, 24(a1)
-; CHECK-NOV-NEXT:    lhu s6, 32(a1)
-; CHECK-NOV-NEXT:    lhu s7, 40(a1)
+; CHECK-NOV-NEXT:    lhu s1, 0(a1)
+; CHECK-NOV-NEXT:    lhu s2, 8(a1)
+; CHECK-NOV-NEXT:    lhu s3, 16(a1)
+; CHECK-NOV-NEXT:    lhu s4, 24(a1)
+; CHECK-NOV-NEXT:    lhu s5, 32(a1)
+; CHECK-NOV-NEXT:    lhu s6, 40(a1)
+; CHECK-NOV-NEXT:    lhu s7, 56(a1)
 ; CHECK-NOV-NEXT:    lhu a1, 48(a1)
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs6, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs5, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s6
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs4, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s5
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs3, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s4
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs2, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s3
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs1, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, s1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, s7
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2@plt
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -5350,13 +5350,13 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s4, 0(a0)
 ; CHECK-V-NEXT:    lhu s0, 56(a0)
 ; CHECK-V-NEXT:    lhu s1, 48(a0)
 ; CHECK-V-NEXT:    lhu s2, 40(a0)
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu s5, 24(a0)
+; CHECK-V-NEXT:    lhu s6, 16(a0)
 ; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
@@ -5365,7 +5365,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
@@ -5374,7 +5374,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s5
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
@@ -5383,7 +5383,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index 65dca0daed8c775..bf4dbe7ee14ffd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -9,20 +9,20 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    lhu s0, 6(a0)
-; CHECK-NEXT:    lhu s1, 4(a0)
-; CHECK-NEXT:    lhu s2, 0(a0)
+; CHECK-NEXT:    lhu s0, 0(a0)
+; CHECK-NEXT:    lhu s1, 6(a0)
+; CHECK-NEXT:    lhu s2, 4(a0)
 ; CHECK-NEXT:    lhu a0, 2(a0)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 8(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s2
+; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 0(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s1
+; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 12(sp)
-; CHECK-NEXT:    fmv.w.x fa0, s0
+; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2@plt
 ; CHECK-NEXT:    fsw fa0, 4(sp)
 ; CHECK-NEXT:    addi a0, sp, 8
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 97121c275a29446..40adbbcd41fcdf6 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -151,12 +151,19 @@ define i64 @shl64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: lshr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 35(sp)
+; RV32I-NEXT:    sb zero, 34(sp)
+; RV32I-NEXT:    sb zero, 33(sp)
+; RV32I-NEXT:    sb zero, 32(sp)
 ; RV32I-NEXT:    sb zero, 31(sp)
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
@@ -169,94 +176,90 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
 ; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
+; RV32I-NEXT:    sb a1, 16(sp)
+; RV32I-NEXT:    sb a5, 12(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a3, 4(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
+; RV32I-NEXT:    sb a6, 19(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
+; RV32I-NEXT:    sb a6, 18(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
+; RV32I-NEXT:    sb a1, 17(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
+; RV32I-NEXT:    sb a1, 15(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
+; RV32I-NEXT:    sb a1, 14(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
+; RV32I-NEXT:    sb a5, 13(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
+; RV32I-NEXT:    sb a1, 11(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
+; RV32I-NEXT:    sb a1, 10(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
+; RV32I-NEXT:    sb a4, 9(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
+; RV32I-NEXT:    sb a1, 7(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
+; RV32I-NEXT:    sb a1, 6(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
+; RV32I-NEXT:    sb a3, 5(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    addi a3, sp, 4
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a1)
+; RV32I-NEXT:    lbu t0, 5(a1)
+; RV32I-NEXT:    lbu t1, 6(a1)
+; RV32I-NEXT:    lbu t2, 7(a1)
+; RV32I-NEXT:    lbu t3, 8(a1)
+; RV32I-NEXT:    lbu t4, 9(a1)
+; RV32I-NEXT:    lbu t5, 10(a1)
+; RV32I-NEXT:    lbu t6, 11(a1)
+; RV32I-NEXT:    lbu s0, 12(a1)
+; RV32I-NEXT:    lbu s1, 13(a1)
+; RV32I-NEXT:    lbu s2, 14(a1)
+; RV32I-NEXT:    lbu a1, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -265,7 +268,10 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: lshr128:
@@ -293,125 +299,131 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: ashr128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw a3, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a5, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    lw a5, 0(a1)
+; RV32I-NEXT:    lw a6, 4(a1)
+; RV32I-NEXT:    lw a1, 0(a2)
+; RV32I-NEXT:    sb a4, 16(sp)
 ; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    sb a6, 8(sp)
 ; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
+; RV32I-NEXT:    srai a2, a4, 31
+; RV32I-NEXT:    sb a2, 32(sp)
+; RV32I-NEXT:    sb a2, 28(sp)
+; RV32I-NEXT:    sb a2, 24(sp)
+; RV32I-NEXT:    sb a2, 20(sp)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 19(sp)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 18(sp)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(sp)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
 ; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a6, 24
 ; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    srli a3, a6, 16
 ; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    srli a3, a6, 8
+; RV32I-NEXT:    sb a3, 9(sp)
 ; RV32I-NEXT:    srli a3, a5, 24
 ; RV32I-NEXT:    sb a3, 7(sp)
 ; RV32I-NEXT:    srli a3, a5, 16
 ; RV32I-NEXT:    sb a3, 6(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
 ; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a3, a2, 24
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    srli a4, a2, 16
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    srli a2, a2, 8
+; RV32I-NEXT:    sb a2, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a2, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a2, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a2, 21(sp)
+; RV32I-NEXT:    slli a2, a1, 25
+; RV32I-NEXT:    srli a2, a2, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a2, a3, a2
+; RV32I-NEXT:    lbu a3, 0(a2)
+; RV32I-NEXT:    lbu a4, 1(a2)
+; RV32I-NEXT:    lbu a5, 2(a2)
+; RV32I-NEXT:    lbu a6, 3(a2)
+; RV32I-NEXT:    lbu a7, 4(a2)
+; RV32I-NEXT:    lbu t0, 5(a2)
+; RV32I-NEXT:    lbu t1, 6(a2)
+; RV32I-NEXT:    lbu t2, 7(a2)
+; RV32I-NEXT:    lbu t3, 8(a2)
+; RV32I-NEXT:    lbu t4, 9(a2)
+; RV32I-NEXT:    lbu t5, 10(a2)
+; RV32I-NEXT:    lbu t6, 11(a2)
+; RV32I-NEXT:    lbu s0, 12(a2)
+; RV32I-NEXT:    lbu s1, 13(a2)
+; RV32I-NEXT:    lbu s2, 14(a2)
+; RV32I-NEXT:    lbu a2, 15(a2)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    andi a1, a1, 7
+; RV32I-NEXT:    srl a3, a3, a1
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a5, t2, t1
 ; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    xori a6, a1, 31
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a7, t6, t5
 ; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
+; RV32I-NEXT:    not t0, a1
 ; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    srl a4, a4, a1
 ; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    slli a7, a1, 1
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a2, a2, 24
+; RV32I-NEXT:    or a2, a2, s2
+; RV32I-NEXT:    or a2, a2, s0
+; RV32I-NEXT:    slli a7, a2, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    sra a1, a2, a1
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: ashr128:
@@ -439,12 +451,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: shl128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    addi sp, sp, -48
+; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sb zero, 19(sp)
+; RV32I-NEXT:    sb zero, 18(sp)
+; RV32I-NEXT:    sb zero, 17(sp)
+; RV32I-NEXT:    sb zero, 16(sp)
 ; RV32I-NEXT:    sb zero, 15(sp)
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
@@ -457,103 +476,102 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sb zero, 6(sp)
 ; RV32I-NEXT:    sb zero, 5(sp)
 ; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a5, 28(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    sb a3, 20(sp)
 ; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
+; RV32I-NEXT:    sb a6, 35(sp)
 ; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
+; RV32I-NEXT:    sb a6, 34(sp)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a1, 33(sp)
 ; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
+; RV32I-NEXT:    sb a1, 31(sp)
 ; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
+; RV32I-NEXT:    sb a1, 30(sp)
 ; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
+; RV32I-NEXT:    sb a5, 29(sp)
 ; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
+; RV32I-NEXT:    sb a1, 27(sp)
 ; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
+; RV32I-NEXT:    sb a1, 26(sp)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
+; RV32I-NEXT:    sb a4, 25(sp)
 ; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
+; RV32I-NEXT:    sb a1, 23(sp)
 ; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
+; RV32I-NEXT:    sb a1, 22(sp)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
+; RV32I-NEXT:    sb a3, 21(sp)
 ; RV32I-NEXT:    slli a1, a2, 25
 ; RV32I-NEXT:    srli a1, a1, 28
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    addi a3, sp, 20
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 4(a3)
+; RV32I-NEXT:    lbu a4, 5(a3)
+; RV32I-NEXT:    lbu a5, 6(a3)
+; RV32I-NEXT:    lbu a6, 7(a3)
+; RV32I-NEXT:    lbu a7, 8(a3)
+; RV32I-NEXT:    lbu t0, 9(a3)
+; RV32I-NEXT:    lbu t1, 10(a3)
+; RV32I-NEXT:    lbu t2, 11(a3)
+; RV32I-NEXT:    lbu t3, 12(a3)
+; RV32I-NEXT:    lbu t4, 13(a3)
+; RV32I-NEXT:    lbu t5, 14(a3)
+; RV32I-NEXT:    lbu t6, 15(a3)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a3)
+; RV32I-NEXT:    lbu s1, 1(a3)
+; RV32I-NEXT:    lbu s2, 2(a3)
+; RV32I-NEXT:    lbu a3, 3(a3)
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    sll a4, a1, a2
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    srli a5, a3, 1
+; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
-; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    not a7, a2
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    sll a7, a5, a2
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    sll a7, a7, a2
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    sll a2, a3, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 48
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: shl128:
@@ -616,60 +634,60 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 8(a1)
 ; RV32I-NEXT:    lw t2, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    andi t1, a2, 64
 ; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    mv a4, t2
+; RV32I-NEXT:    mv a1, t2
 ; RV32I-NEXT:    beqz t1, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    mv a4, a3
+; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:  .LBB10_2:
 ; RV32I-NEXT:    andi a6, a2, 32
-; RV32I-NEXT:    mv a5, a4
+; RV32I-NEXT:    mv a5, a1
 ; RV32I-NEXT:    bnez a6, .LBB10_13
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    bnez t1, .LBB10_14
 ; RV32I-NEXT:  .LBB10_4:
 ; RV32I-NEXT:    beqz a6, .LBB10_6
 ; RV32I-NEXT:  .LBB10_5:
-; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv t0, a4
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    slli t3, t0, 1
 ; RV32I-NEXT:    not t2, a2
 ; RV32I-NEXT:    beqz t1, .LBB10_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv a1, a7
+; RV32I-NEXT:    mv a3, a7
 ; RV32I-NEXT:  .LBB10_8:
 ; RV32I-NEXT:    srl a7, a5, a2
 ; RV32I-NEXT:    sll t1, t3, t2
 ; RV32I-NEXT:    srl t0, t0, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a4, a3
 ; RV32I-NEXT:  .LBB10_10:
 ; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    slli t1, a3, 1
+; RV32I-NEXT:    slli t1, a4, 1
 ; RV32I-NEXT:    sll t1, t1, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sll a4, a4, t2
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli a1, a3, 1
+; RV32I-NEXT:    sll a1, a1, t2
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    srl a2, a3, a2
 ; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    sll a2, a5, t2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sll a3, a5, t2
+; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    sw t0, 4(a0)
 ; RV32I-NEXT:    sw a7, 0(a0)
 ; RV32I-NEXT:    ret
@@ -677,7 +695,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    beqz t1, .LBB10_4
 ; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    mv a3, t2
+; RV32I-NEXT:    mv a4, t2
 ; RV32I-NEXT:    bnez a6, .LBB10_5
 ; RV32I-NEXT:    j .LBB10_6
 ;
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 122388c1b73ec3e..fcaa1f7f238f6d8 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -310,22 +310,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lbu a0, 12(a0)
-; RV32-NEXT:    lw a1, 8(s0)
-; RV32-NEXT:    slli a2, a0, 30
+; RV32-NEXT:    lbu a1, 12(a0)
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    slli a0, a1, 30
 ; RV32-NEXT:    lw a3, 4(s0)
-; RV32-NEXT:    srli s1, a1, 2
-; RV32-NEXT:    or s1, s1, a2
-; RV32-NEXT:    slli a2, a1, 31
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    or s2, a4, a2
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    slli a0, a0, 31
-; RV32-NEXT:    srai s3, a0, 31
-; RV32-NEXT:    srli a1, a1, 1
-; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srli s1, a2, 2
+; RV32-NEXT:    or s1, s1, a0
+; RV32-NEXT:    slli a4, a2, 31
 ; RV32-NEXT:    lw a0, 0(s0)
-; RV32-NEXT:    srai s4, a1, 31
+; RV32-NEXT:    srli a5, a3, 1
+; RV32-NEXT:    or s2, a5, a4
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai s3, a1, 31
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a2, a2, 31
+; RV32-NEXT:    srai s4, a2, 31
 ; RV32-NEXT:    slli a1, a3, 31
 ; RV32-NEXT:    srai a1, a1, 31
 ; RV32-NEXT:    li a2, 6
@@ -462,22 +462,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    mv s0, a0
-; RV32M-NEXT:    lbu a0, 12(a0)
-; RV32M-NEXT:    lw a1, 8(s0)
-; RV32M-NEXT:    slli a2, a0, 30
+; RV32M-NEXT:    lbu a1, 12(a0)
+; RV32M-NEXT:    lw a2, 8(a0)
+; RV32M-NEXT:    slli a0, a1, 30
 ; RV32M-NEXT:    lw a3, 4(s0)
-; RV32M-NEXT:    srli s1, a1, 2
-; RV32M-NEXT:    or s1, s1, a2
-; RV32M-NEXT:    slli a2, a1, 31
-; RV32M-NEXT:    srli a4, a3, 1
-; RV32M-NEXT:    or s2, a4, a2
-; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    slli a0, a0, 31
-; RV32M-NEXT:    srai s3, a0, 31
-; RV32M-NEXT:    srli a1, a1, 1
-; RV32M-NEXT:    slli a1, a1, 31
+; RV32M-NEXT:    srli s1, a2, 2
+; RV32M-NEXT:    or s1, s1, a0
+; RV32M-NEXT:    slli a4, a2, 31
 ; RV32M-NEXT:    lw a0, 0(s0)
-; RV32M-NEXT:    srai s4, a1, 31
+; RV32M-NEXT:    srli a5, a3, 1
+; RV32M-NEXT:    or s2, a5, a4
+; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    slli a1, a1, 31
+; RV32M-NEXT:    srai s3, a1, 31
+; RV32M-NEXT:    srli a2, a2, 1
+; RV32M-NEXT:    slli a2, a2, 31
+; RV32M-NEXT:    srai s4, a2, 31
 ; RV32M-NEXT:    slli a1, a3, 31
 ; RV32M-NEXT:    srai a1, a1, 31
 ; RV32M-NEXT:    li a2, 6
@@ -536,34 +536,34 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    ld a1, 0(a0)
 ; RV64M-NEXT:    lwu a2, 8(a0)
-; RV64M-NEXT:    srli a3, a1, 2
-; RV64M-NEXT:    lbu a4, 12(a0)
+; RV64M-NEXT:    lbu a3, 12(a0)
+; RV64M-NEXT:    srli a4, a1, 2
 ; RV64M-NEXT:    slli a5, a2, 62
-; RV64M-NEXT:    or a3, a5, a3
-; RV64M-NEXT:    srai a3, a3, 31
-; RV64M-NEXT:    slli a4, a4, 32
-; RV64M-NEXT:    or a2, a2, a4
+; RV64M-NEXT:    or a4, a5, a4
+; RV64M-NEXT:    srai a4, a4, 31
+; RV64M-NEXT:    slli a3, a3, 32
+; RV64M-NEXT:    or a2, a2, a3
 ; RV64M-NEXT:    slli a2, a2, 29
-; RV64M-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64M-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64M-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64M-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64M-NEXT:    srai a2, a2, 31
 ; RV64M-NEXT:    slli a1, a1, 31
 ; RV64M-NEXT:    srai a1, a1, 31
-; RV64M-NEXT:    mulh a4, a2, a4
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
+; RV64M-NEXT:    mulh a3, a2, a3
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
 ; RV64M-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64M-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    slli a4, a4, 2
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    mulh a4, a3, a5
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    srai a4, a4, 1
-; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    slli a5, a4, 3
-; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    slli a3, a3, 2
+; RV64M-NEXT:    add a2, a2, a3
+; RV64M-NEXT:    mulh a3, a4, a5
+; RV64M-NEXT:    srli a5, a3, 63
+; RV64M-NEXT:    srai a3, a3, 1
+; RV64M-NEXT:    add a3, a3, a5
+; RV64M-NEXT:    slli a5, a3, 3
+; RV64M-NEXT:    add a3, a4, a3
 ; RV64M-NEXT:    sub a3, a3, a5
 ; RV64M-NEXT:    addi a3, a3, -1
 ; RV64M-NEXT:    seqz a3, a3
@@ -612,22 +612,22 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    slli a1, a1, 1
 ; RV32MV-NEXT:    sub sp, sp, a1
 ; RV32MV-NEXT:    mv s0, a0
-; RV32MV-NEXT:    lbu a0, 12(a0)
-; RV32MV-NEXT:    lw a1, 8(s0)
-; RV32MV-NEXT:    slli a2, a0, 30
+; RV32MV-NEXT:    lbu a1, 12(a0)
+; RV32MV-NEXT:    lw a2, 8(a0)
+; RV32MV-NEXT:    slli a0, a1, 30
 ; RV32MV-NEXT:    lw a3, 4(s0)
-; RV32MV-NEXT:    srli s1, a1, 2
-; RV32MV-NEXT:    or s1, s1, a2
-; RV32MV-NEXT:    slli a2, a1, 31
-; RV32MV-NEXT:    srli a4, a3, 1
-; RV32MV-NEXT:    or s2, a4, a2
-; RV32MV-NEXT:    srli a0, a0, 2
-; RV32MV-NEXT:    slli a0, a0, 31
-; RV32MV-NEXT:    srai s3, a0, 31
-; RV32MV-NEXT:    srli a1, a1, 1
-; RV32MV-NEXT:    slli a1, a1, 31
+; RV32MV-NEXT:    srli s1, a2, 2
+; RV32MV-NEXT:    or s1, s1, a0
+; RV32MV-NEXT:    slli a4, a2, 31
 ; RV32MV-NEXT:    lw a0, 0(s0)
-; RV32MV-NEXT:    srai s4, a1, 31
+; RV32MV-NEXT:    srli a5, a3, 1
+; RV32MV-NEXT:    or s2, a5, a4
+; RV32MV-NEXT:    srli a1, a1, 2
+; RV32MV-NEXT:    slli a1, a1, 31
+; RV32MV-NEXT:    srai s3, a1, 31
+; RV32MV-NEXT:    srli a2, a2, 1
+; RV32MV-NEXT:    slli a2, a2, 31
+; RV32MV-NEXT:    srai s4, a2, 31
 ; RV32MV-NEXT:    slli a1, a3, 31
 ; RV32MV-NEXT:    srai a1, a1, 31
 ; RV32MV-NEXT:    li a2, 6
@@ -727,46 +727,46 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV:       # %bb.0:
 ; RV64MV-NEXT:    ld a1, 0(a0)
 ; RV64MV-NEXT:    lwu a2, 8(a0)
-; RV64MV-NEXT:    srli a3, a1, 2
-; RV64MV-NEXT:    lbu a4, 12(a0)
+; RV64MV-NEXT:    lbu a3, 12(a0)
+; RV64MV-NEXT:    srli a4, a1, 2
 ; RV64MV-NEXT:    slli a5, a2, 62
-; RV64MV-NEXT:    or a3, a5, a3
-; RV64MV-NEXT:    srai a3, a3, 31
-; RV64MV-NEXT:    slli a4, a4, 32
-; RV64MV-NEXT:    or a2, a2, a4
+; RV64MV-NEXT:    or a4, a5, a4
+; RV64MV-NEXT:    srai a4, a4, 31
+; RV64MV-NEXT:    slli a3, a3, 32
+; RV64MV-NEXT:    or a2, a2, a3
 ; RV64MV-NEXT:    slli a2, a2, 29
-; RV64MV-NEXT:    lui a4, %hi(.LCPI3_0)
-; RV64MV-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64MV-NEXT:    lui a3, %hi(.LCPI3_0)
+; RV64MV-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
 ; RV64MV-NEXT:    srai a2, a2, 31
 ; RV64MV-NEXT:    slli a1, a1, 31
 ; RV64MV-NEXT:    srai a1, a1, 31
-; RV64MV-NEXT:    mulh a4, a2, a4
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    mulh a3, a2, a3
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    slli a4, a4, 2
-; RV64MV-NEXT:    add a2, a2, a4
-; RV64MV-NEXT:    mulh a4, a3, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    slli a3, a3, 2
+; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    mulh a3, a4, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    srai a3, a3, 1
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    add a3, a3, a4
-; RV64MV-NEXT:    slli a4, a4, 3
-; RV64MV-NEXT:    sub a3, a3, a4
-; RV64MV-NEXT:    mulh a4, a1, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    add a4, a4, a5
+; RV64MV-NEXT:    add a4, a4, a3
+; RV64MV-NEXT:    slli a3, a3, 3
+; RV64MV-NEXT:    sub a4, a4, a3
+; RV64MV-NEXT:    mulh a3, a1, a5
+; RV64MV-NEXT:    srli a5, a3, 63
+; RV64MV-NEXT:    add a3, a3, a5
 ; RV64MV-NEXT:    li a5, 6
-; RV64MV-NEXT:    mul a4, a4, a5
-; RV64MV-NEXT:    sub a1, a1, a4
+; RV64MV-NEXT:    mul a3, a3, a5
+; RV64MV-NEXT:    sub a1, a1, a3
 ; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64MV-NEXT:    vmv.v.x v8, a1
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a3
+; RV64MV-NEXT:    vslide1down.vx v8, v8, a4
 ; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64MV-NEXT:    li a1, -1
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 3335ca3a34b6c6d..091b7d229a06c58 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -53,20 +53,20 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
 ; RV32IM-NEXT:    lh a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 507375
 ; RV32IM-NEXT:    addi a5, a5, 1981
 ; RV32IM-NEXT:    mulh a5, a1, a5
@@ -79,26 +79,26 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    lui a5, 342392
 ; RV32IM-NEXT:    addi a5, a5, 669
-; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    mulh a5, a4, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 5
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 780943
 ; RV32IM-NEXT:    addi a5, a5, 1809
-; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    mulh a5, a3, a5
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 8
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, -1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_1:
@@ -241,20 +241,20 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 8(a1)
-; RV32IM-NEXT:    lh a4, 0(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
 ; RV32IM-NEXT:    lh a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a6
 ; RV32IM-NEXT:    mulh a6, a1, a5
 ; RV32IM-NEXT:    add a6, a6, a1
 ; RV32IM-NEXT:    srli t0, a6, 31
@@ -262,24 +262,24 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    mulh a6, a3, a5
-; RV32IM-NEXT:    add a6, a6, a3
+; RV32IM-NEXT:    mulh a6, a4, a5
+; RV32IM-NEXT:    add a6, a6, a4
 ; RV32IM-NEXT:    srli t0, a6, 31
 ; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, t0
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_srem_vec_2:
@@ -445,14 +445,14 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_srem_sdiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 0(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
 ; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
+; RV32IM-NEXT:    mulh a6, a2, a5
+; RV32IM-NEXT:    add a6, a6, a2
 ; RV32IM-NEXT:    srli a7, a6, 31
 ; RV32IM-NEXT:    srai a6, a6, 6
 ; RV32IM-NEXT:    add a6, a6, a7
@@ -464,30 +464,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    srai t1, t1, 6
 ; RV32IM-NEXT:    add t1, t1, t2
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulh t3, a3, a5
-; RV32IM-NEXT:    add t3, t3, a3
+; RV32IM-NEXT:    mulh t3, a4, a5
+; RV32IM-NEXT:    add t3, t3, a4
 ; RV32IM-NEXT:    srli t4, t3, 31
 ; RV32IM-NEXT:    srai t3, t3, 6
 ; RV32IM-NEXT:    add t3, t3, t4
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli t5, a5, 31
 ; RV32IM-NEXT:    srai a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, t5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_srem_sdiv:
@@ -655,36 +655,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 8(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
-; RV32IM-NEXT:    lh a4, 12(a1)
+; RV32IM-NEXT:    lh a2, 12(a1)
+; RV32IM-NEXT:    lh a3, 8(a1)
+; RV32IM-NEXT:    lh a4, 4(a1)
 ; RV32IM-NEXT:    lh a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    add a5, a5, a4
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a5, a5, a2
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    srli a5, a1, 26
 ; RV32IM-NEXT:    add a5, a1, a5
 ; RV32IM-NEXT:    andi a5, a5, -64
 ; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a3, 27
-; RV32IM-NEXT:    add a5, a3, a5
+; RV32IM-NEXT:    srli a5, a4, 27
+; RV32IM-NEXT:    add a5, a4, a5
 ; RV32IM-NEXT:    andi a5, a5, -32
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    srli a5, a2, 29
-; RV32IM-NEXT:    add a5, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    srli a5, a3, 29
+; RV32IM-NEXT:    add a5, a3, a5
 ; RV32IM-NEXT:    andi a5, a5, -8
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_power_of_two:
@@ -803,19 +803,19 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_srem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
+; RV32IM-NEXT:    lh a3, 12(a1)
 ; RV32IM-NEXT:    lh a1, 8(a1)
 ; RV32IM-NEXT:    lui a4, 820904
 ; RV32IM-NEXT:    addi a4, a4, -1903
-; RV32IM-NEXT:    mulh a4, a3, a4
-; RV32IM-NEXT:    add a4, a4, a3
+; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    add a4, a4, a2
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 9
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 729444
 ; RV32IM-NEXT:    addi a4, a4, 713
 ; RV32IM-NEXT:    mulh a4, a1, a4
@@ -828,18 +828,18 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    lui a4, 395996
 ; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    mulh a4, a3, a4
 ; RV32IM-NEXT:    srli a5, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 11
 ; RV32IM-NEXT:    add a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a3, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_srem_one:
@@ -1036,34 +1036,34 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lh a2, 16(a1)
 ; RV64IM-NEXT:    lui a3, %hi(.LCPI5_0)
 ; RV64IM-NEXT:    ld a3, %lo(.LCPI5_0)(a3)
-; RV64IM-NEXT:    lh a4, 24(a1)
+; RV64IM-NEXT:    lh a4, 8(a1)
+; RV64IM-NEXT:    lh a1, 24(a1)
 ; RV64IM-NEXT:    mulh a3, a2, a3
 ; RV64IM-NEXT:    add a3, a3, a2
 ; RV64IM-NEXT:    srli a5, a3, 63
 ; RV64IM-NEXT:    srli a3, a3, 4
 ; RV64IM-NEXT:    add a3, a3, a5
-; RV64IM-NEXT:    li a5, 23
-; RV64IM-NEXT:    lui a6, %hi(.LCPI5_1)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI5_1)(a6)
-; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    lh a1, 8(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
+; RV64IM-NEXT:    li a6, 23
+; RV64IM-NEXT:    mul a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    mulh a3, a4, a6
+; RV64IM-NEXT:    mulh a3, a1, a5
 ; RV64IM-NEXT:    srli a5, a3, 63
 ; RV64IM-NEXT:    srli a3, a3, 11
 ; RV64IM-NEXT:    add a3, a3, a5
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addi a5, a5, 1327
 ; RV64IM-NEXT:    mul a3, a3, a5
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    srli a3, a1, 49
-; RV64IM-NEXT:    add a3, a1, a3
+; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    srli a3, a4, 49
+; RV64IM-NEXT:    add a3, a4, a3
 ; RV64IM-NEXT:    lui a5, 8
 ; RV64IM-NEXT:    and a3, a3, a5
-; RV64IM-NEXT:    subw a1, a1, a3
+; RV64IM-NEXT:    subw a4, a4, a3
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a4, 6(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    sh a2, 4(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
@@ -1085,13 +1085,13 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw s0, 28(a1)
+; RV32I-NEXT:    lw s1, 24(a1)
+; RV32I-NEXT:    lw s2, 20(a1)
+; RV32I-NEXT:    lw s3, 16(a1)
+; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
@@ -1101,23 +1101,23 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s5
+; RV32I-NEXT:    mv a1, s4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __moddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
@@ -1154,13 +1154,13 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
+; RV32IM-NEXT:    lw s0, 28(a1)
+; RV32IM-NEXT:    lw s1, 24(a1)
+; RV32IM-NEXT:    lw s2, 20(a1)
+; RV32IM-NEXT:    lw s3, 16(a1)
+; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
@@ -1170,23 +1170,23 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s5
+; RV32IM-NEXT:    mv a1, s4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s4, a0
 ; RV32IM-NEXT:    mv s5, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s2
-; RV32IM-NEXT:    mv a1, s3
+; RV32IM-NEXT:    mv a0, s3
+; RV32IM-NEXT:    mv a1, s2
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s1
+; RV32IM-NEXT:    mv a1, s0
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __moddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 8c0d97afe6c21d4..1525804be545c6d 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,47 +10,47 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 12(a1)
-; RISCV32-NEXT:    lw a7, 12(a2)
-; RISCV32-NEXT:    lw a6, 8(a1)
-; RISCV32-NEXT:    lw a4, 0(a2)
-; RISCV32-NEXT:    lw a5, 0(a1)
+; RISCV32-NEXT:    lw a3, 0(a2)
+; RISCV32-NEXT:    lw a4, 12(a1)
+; RISCV32-NEXT:    lw a5, 8(a1)
 ; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw t0, 8(a2)
-; RISCV32-NEXT:    lw a2, 4(a2)
-; RISCV32-NEXT:    mulhu a1, a5, a4
-; RISCV32-NEXT:    mul t1, t2, a4
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, t2, a4
+; RISCV32-NEXT:    lw a1, 0(a1)
+; RISCV32-NEXT:    lw a6, 12(a2)
+; RISCV32-NEXT:    lw a7, 8(a2)
+; RISCV32-NEXT:    lw t0, 4(a2)
+; RISCV32-NEXT:    mulhu a2, a1, a3
+; RISCV32-NEXT:    mul t1, t2, a3
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, t2, a3
 ; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a5, a2
-; RISCV32-NEXT:    add a1, t1, a1
-; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t3, a5, a2
+; RISCV32-NEXT:    mul t1, a1, t0
+; RISCV32-NEXT:    add a2, t1, a2
+; RISCV32-NEXT:    sltu t1, a2, t1
+; RISCV32-NEXT:    mulhu t3, a1, t0
 ; RISCV32-NEXT:    add t1, t3, t1
 ; RISCV32-NEXT:    add t5, t4, t1
-; RISCV32-NEXT:    mul t6, t2, a2
+; RISCV32-NEXT:    mul t6, t2, t0
 ; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, t0, a5
-; RISCV32-NEXT:    mul s3, a6, a4
+; RISCV32-NEXT:    mul t1, a7, a1
+; RISCV32-NEXT:    mul s3, a5, a3
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t3, t1, s0
 ; RISCV32-NEXT:    sltu s0, s0, t6
 ; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, t2, a2
+; RISCV32-NEXT:    mulhu t5, t2, t0
 ; RISCV32-NEXT:    add t4, t5, t4
 ; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, t0
-; RISCV32-NEXT:    mul t5, a7, a5
+; RISCV32-NEXT:    mul t4, t2, a7
+; RISCV32-NEXT:    mul t5, a6, a1
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, t0, a5
+; RISCV32-NEXT:    mulhu s1, a7, a1
 ; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, a2, a6
-; RISCV32-NEXT:    mul t5, a3, a4
+; RISCV32-NEXT:    mul t4, t0, a5
+; RISCV32-NEXT:    mul t5, a4, a3
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a6, a4
+; RISCV32-NEXT:    mulhu t5, a5, a3
 ; RISCV32-NEXT:    add t6, t5, t4
 ; RISCV32-NEXT:    add t4, t6, s2
 ; RISCV32-NEXT:    sltu s3, s4, s3
@@ -63,41 +63,41 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:  .LBB0_2: # %start
 ; RISCV32-NEXT:    sltu s0, s2, s1
 ; RISCV32-NEXT:    snez s1, t2
-; RISCV32-NEXT:    snez s2, a7
+; RISCV32-NEXT:    snez s2, a6
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a7, a5
+; RISCV32-NEXT:    mulhu s2, a6, a1
 ; RISCV32-NEXT:    snez s2, s2
 ; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, t0
+; RISCV32-NEXT:    mulhu t2, t2, a7
 ; RISCV32-NEXT:    snez t2, t2
 ; RISCV32-NEXT:    or t2, s1, t2
 ; RISCV32-NEXT:    or t2, t2, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    snez t6, a2
-; RISCV32-NEXT:    snez s0, a3
+; RISCV32-NEXT:    snez t6, t0
+; RISCV32-NEXT:    snez s0, a4
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a3, a4
+; RISCV32-NEXT:    mulhu s0, a4, a3
 ; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu a2, a2, a6
-; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    or a2, t6, a2
-; RISCV32-NEXT:    or a2, a2, t5
-; RISCV32-NEXT:    or a7, t0, a7
-; RISCV32-NEXT:    snez a7, a7
-; RISCV32-NEXT:    or a3, a6, a3
-; RISCV32-NEXT:    snez a3, a3
-; RISCV32-NEXT:    and a3, a3, a7
-; RISCV32-NEXT:    or a2, a3, a2
-; RISCV32-NEXT:    or a3, t2, t3
-; RISCV32-NEXT:    or a2, a2, a3
-; RISCV32-NEXT:    mul a3, a5, a4
-; RISCV32-NEXT:    andi a2, a2, 1
-; RISCV32-NEXT:    sw a3, 0(a0)
-; RISCV32-NEXT:    sw a1, 4(a0)
+; RISCV32-NEXT:    mulhu t0, t0, a5
+; RISCV32-NEXT:    snez t0, t0
+; RISCV32-NEXT:    or t0, t6, t0
+; RISCV32-NEXT:    or t0, t0, t5
+; RISCV32-NEXT:    or a6, a7, a6
+; RISCV32-NEXT:    snez a6, a6
+; RISCV32-NEXT:    or a4, a5, a4
+; RISCV32-NEXT:    snez a4, a4
+; RISCV32-NEXT:    and a4, a4, a6
+; RISCV32-NEXT:    or a4, a4, t0
+; RISCV32-NEXT:    or a5, t2, t3
+; RISCV32-NEXT:    or a4, a4, a5
+; RISCV32-NEXT:    mul a1, a1, a3
+; RISCV32-NEXT:    andi a4, a4, 1
+; RISCV32-NEXT:    sw a1, 0(a0)
+; RISCV32-NEXT:    sw a2, 4(a0)
 ; RISCV32-NEXT:    sw t1, 8(a0)
 ; RISCV32-NEXT:    sw t4, 12(a0)
-; RISCV32-NEXT:    sb a2, 16(a0)
+; RISCV32-NEXT:    sb a4, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index ce0d8fedbfb88f2..9cc835da3cc29e7 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -85,50 +85,49 @@ define i32 @load_i32(ptr %p) {
 define i64 @load_i64(ptr %p) {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 0(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    lbu a2, 1(a0)
 ; RV32I-NEXT:    lbu a3, 2(a0)
 ; RV32I-NEXT:    lbu a4, 3(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
+; RV32I-NEXT:    slli a2, a2, 8
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a2, a4, a3
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    or a0, a4, a3
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a1, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a2, t0, a7
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: load_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 0(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    lbu a2, 1(a0)
 ; RV64I-NEXT:    lbu a3, 2(a0)
 ; RV64I-NEXT:    lbu a4, 3(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a2, a2, 8
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    lbu a2, 5(a0)
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a2, a2, 8
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a2, a6, a5
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 32aca29d16e9b9b..8fc4465ffab1fca 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -54,16 +54,16 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    lui a5, 8456
 ; RV32IM-NEXT:    addi a5, a5, 1058
 ; RV32IM-NEXT:    mulhu a5, a1, a5
@@ -72,20 +72,20 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    lui a5, 10700
 ; RV32IM-NEXT:    addi a5, a5, -1003
-; RV32IM-NEXT:    mulhu a5, a3, a5
+; RV32IM-NEXT:    mulhu a5, a4, a5
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sub a4, a4, a5
 ; RV32IM-NEXT:    lui a5, 1045
 ; RV32IM-NEXT:    addi a5, a5, 1801
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    li a6, 1003
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_1:
@@ -214,29 +214,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_urem_vec_2:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 0(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a6
 ; RV32IM-NEXT:    mulhu a6, a1, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a1, a1, a6
-; RV32IM-NEXT:    mulhu a6, a3, a5
+; RV32IM-NEXT:    mulhu a6, a4, a5
 ; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    mul a5, a5, a7
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sh a2, 6(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sub a3, a3, a5
+; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 2(a0)
-; RV32IM-NEXT:    sh a4, 0(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: fold_urem_vec_2:
@@ -386,33 +386,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_urem_udiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 0(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 0(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a4, a5
+; RV32IM-NEXT:    mulhu a6, a2, a5
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul t0, a6, a7
 ; RV32IM-NEXT:    mulhu t1, a1, a5
 ; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulhu t3, a3, a5
+; RV32IM-NEXT:    mulhu t3, a4, a5
 ; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mulhu a5, a3, a5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a2, a2, a5
-; RV32IM-NEXT:    sub a2, a2, a7
-; RV32IM-NEXT:    add a3, a3, t3
-; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    add a4, a4, t3
+; RV32IM-NEXT:    sub a4, a4, t4
 ; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a4, a4, a6
-; RV32IM-NEXT:    sub a4, a4, t0
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    add a2, a2, a6
+; RV32IM-NEXT:    sub a2, a2, t0
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
-; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
+; RV32IM-NEXT:    sh a3, 0(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_urem_udiv:
@@ -531,10 +531,10 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    lhu s1, 8(a1)
 ; RV32I-NEXT:    lhu s2, 4(a1)
 ; RV32I-NEXT:    lhu s3, 0(a1)
-; RV32I-NEXT:    lhu a2, 12(a1)
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 95
 ; RV32I-NEXT:    mv a0, a2
@@ -556,23 +556,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 8(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 8(a1)
+; RV32IM-NEXT:    lhu a4, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 0(a1)
 ; RV32IM-NEXT:    lui a5, 11038
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a3, a3, 31
-; RV32IM-NEXT:    andi a2, a2, 7
-; RV32IM-NEXT:    sh a2, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    andi a4, a4, 31
+; RV32IM-NEXT:    andi a3, a3, 7
+; RV32IM-NEXT:    sh a3, 4(a0)
+; RV32IM-NEXT:    sh a4, 2(a0)
 ; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_power_of_two:
@@ -583,10 +583,10 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    lhu s1, 16(a1)
 ; RV64I-NEXT:    lhu s2, 8(a1)
 ; RV64I-NEXT:    lhu s3, 0(a1)
-; RV64I-NEXT:    lhu a2, 24(a1)
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 95
 ; RV64I-NEXT:    mv a0, a2
@@ -670,15 +670,15 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a2, 4(a1)
+; RV32IM-NEXT:    lhu a3, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
 ; RV32IM-NEXT:    lui a4, 1603
 ; RV32IM-NEXT:    addi a4, a4, 1341
-; RV32IM-NEXT:    mulhu a4, a3, a4
+; RV32IM-NEXT:    mulhu a4, a2, a4
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    lui a4, 45590
 ; RV32IM-NEXT:    addi a4, a4, 1069
 ; RV32IM-NEXT:    mulhu a4, a1, a4
@@ -687,15 +687,15 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a4
 ; RV32IM-NEXT:    lui a4, 193
 ; RV32IM-NEXT:    addi a4, a4, 1464
-; RV32IM-NEXT:    mulhu a4, a2, a4
+; RV32IM-NEXT:    mulhu a4, a3, a4
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a3, a3, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a3, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_one:
@@ -791,13 +791,13 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lw s0, 24(a1)
-; RV32I-NEXT:    lw s1, 28(a1)
-; RV32I-NEXT:    lw s2, 16(a1)
-; RV32I-NEXT:    lw s3, 20(a1)
-; RV32I-NEXT:    lw s4, 8(a1)
-; RV32I-NEXT:    lw s5, 12(a1)
+; RV32I-NEXT:    lw s0, 28(a1)
+; RV32I-NEXT:    lw s1, 24(a1)
+; RV32I-NEXT:    lw s2, 20(a1)
+; RV32I-NEXT:    lw s3, 16(a1)
+; RV32I-NEXT:    lw s4, 12(a1)
 ; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw s5, 8(a1)
 ; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    mv s6, a0
 ; RV32I-NEXT:    li a2, 1
@@ -807,23 +807,23 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32I-NEXT:    mv s7, a0
 ; RV32I-NEXT:    mv s8, a1
 ; RV32I-NEXT:    li a2, 654
-; RV32I-NEXT:    mv a0, s4
-; RV32I-NEXT:    mv a1, s5
+; RV32I-NEXT:    mv a0, s5
+; RV32I-NEXT:    mv a1, s4
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    li a2, 23
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv a0, s3
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui a0, 1
 ; RV32I-NEXT:    addi a2, a0, 1327
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __umoddi3@plt
 ; RV32I-NEXT:    sw a1, 28(s6)
@@ -860,13 +860,13 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    sw s6, 16(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s7, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw s0, 24(a1)
-; RV32IM-NEXT:    lw s1, 28(a1)
-; RV32IM-NEXT:    lw s2, 16(a1)
-; RV32IM-NEXT:    lw s3, 20(a1)
-; RV32IM-NEXT:    lw s4, 8(a1)
-; RV32IM-NEXT:    lw s5, 12(a1)
+; RV32IM-NEXT:    lw s0, 28(a1)
+; RV32IM-NEXT:    lw s1, 24(a1)
+; RV32IM-NEXT:    lw s2, 20(a1)
+; RV32IM-NEXT:    lw s3, 16(a1)
+; RV32IM-NEXT:    lw s4, 12(a1)
 ; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw s5, 8(a1)
 ; RV32IM-NEXT:    lw a1, 4(a1)
 ; RV32IM-NEXT:    mv s6, a0
 ; RV32IM-NEXT:    li a2, 1
@@ -876,23 +876,23 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ; RV32IM-NEXT:    mv s7, a0
 ; RV32IM-NEXT:    mv s8, a1
 ; RV32IM-NEXT:    li a2, 654
-; RV32IM-NEXT:    mv a0, s4
-; RV32IM-NEXT:    mv a1, s5
+; RV32IM-NEXT:    mv a0, s5
+; RV32IM-NEXT:    mv a1, s4
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s4, a0
 ; RV32IM-NEXT:    mv s5, a1
 ; RV32IM-NEXT:    li a2, 23
-; RV32IM-NEXT:    mv a0, s2
-; RV32IM-NEXT:    mv a1, s3
+; RV32IM-NEXT:    mv a0, s3
+; RV32IM-NEXT:    mv a1, s2
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    mv s2, a0
 ; RV32IM-NEXT:    mv s3, a1
 ; RV32IM-NEXT:    lui a0, 1
 ; RV32IM-NEXT:    addi a2, a0, 1327
-; RV32IM-NEXT:    mv a0, s0
-; RV32IM-NEXT:    mv a1, s1
+; RV32IM-NEXT:    mv a0, s1
+; RV32IM-NEXT:    mv a1, s0
 ; RV32IM-NEXT:    li a3, 0
 ; RV32IM-NEXT:    call __umoddi3@plt
 ; RV32IM-NEXT:    sw a1, 28(s6)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd4..ed5a522a8a74602 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -38,17 +38,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -102,17 +102,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -136,8 +136,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -166,17 +166,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -198,47 +198,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -272,17 +272,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
@@ -334,47 +334,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -408,17 +408,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
@@ -470,47 +470,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 6(a1)
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 6(a1)
 ; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a3, a3, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -544,18 +544,18 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 3
+; RV32I-NEXT:    or a5, a1, a6
+; RV32I-NEXT:    slli a5, a5, 3
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -607,47 +607,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
 ; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -659,25 +659,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -779,38 +779,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -826,47 +826,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 6(a1)
 ; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -878,25 +878,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -998,38 +998,38 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    addi a0, sp, 20
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1045,47 +1045,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 5(a1)
 ; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 6(a1)
 ; RV64I-NEXT:    lbu t0, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    lbu t1, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a5, a5, 35
@@ -1099,25 +1099,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1167,94 +1167,94 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t4, 8(a0)
+; RV32I-NEXT:    lbu t5, 9(a0)
+; RV32I-NEXT:    lbu t6, 10(a0)
+; RV32I-NEXT:    lbu s0, 11(a0)
+; RV32I-NEXT:    lbu s1, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu a0, 14(a0)
+; RV32I-NEXT:    slli s3, a3, 24
 ; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
+; RV32I-NEXT:    sb a0, 14(sp)
+; RV32I-NEXT:    sb s2, 13(sp)
+; RV32I-NEXT:    sb s1, 12(sp)
+; RV32I-NEXT:    sb s0, 11(sp)
+; RV32I-NEXT:    sb t6, 10(sp)
+; RV32I-NEXT:    sb t5, 9(sp)
+; RV32I-NEXT:    sb t4, 8(sp)
+; RV32I-NEXT:    sb t3, 7(sp)
+; RV32I-NEXT:    sb t2, 6(sp)
+; RV32I-NEXT:    sb t1, 5(sp)
+; RV32I-NEXT:    sb t0, 4(sp)
+; RV32I-NEXT:    sb a7, 3(sp)
+; RV32I-NEXT:    sb a6, 2(sp)
+; RV32I-NEXT:    sb a5, 1(sp)
+; RV32I-NEXT:    sb a4, 0(sp)
+; RV32I-NEXT:    srai a0, s3, 31
+; RV32I-NEXT:    sb a0, 28(sp)
+; RV32I-NEXT:    sb a0, 24(sp)
+; RV32I-NEXT:    sb a0, 20(sp)
+; RV32I-NEXT:    sb a0, 16(sp)
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a0, 25(sp)
+; RV32I-NEXT:    sb a3, 23(sp)
+; RV32I-NEXT:    sb a4, 22(sp)
+; RV32I-NEXT:    sb a0, 21(sp)
+; RV32I-NEXT:    sb a3, 19(sp)
+; RV32I-NEXT:    sb a4, 18(sp)
+; RV32I-NEXT:    sb a0, 17(sp)
 ; RV32I-NEXT:    andi a1, a1, 15
 ; RV32I-NEXT:    mv a0, sp
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    lbu a3, 14(a0)
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 8(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 5(a0)
+; RV32I-NEXT:    lbu t5, 4(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s0, 2(a0)
+; RV32I-NEXT:    lbu s1, 1(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    sb a7, 10(a2)
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    sb t1, 8(a2)
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    sb s0, 2(a2)
+; RV32I-NEXT:    sb t6, 3(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb s1, 1(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t2, 7(a2)
+; RV32I-NEXT:    sb t5, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
 ; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
@@ -1286,18 +1286,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1318,19 +1321,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 87(sp)
+; RV64I-NEXT:    sb a7, 86(sp)
 ; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
+; RV64I-NEXT:    sb a1, 84(sp)
+; RV64I-NEXT:    sb a3, 83(sp)
+; RV64I-NEXT:    sb a4, 82(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1395,83 +1396,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    add a0, a0, a5
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1504,18 +1505,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1536,19 +1540,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 59(sp)
+; RV32I-NEXT:    sb a7, 58(sp)
 ; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
+; RV32I-NEXT:    sb a1, 56(sp)
+; RV32I-NEXT:    sb a3, 55(sp)
+; RV32I-NEXT:    sb a4, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1613,83 +1615,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 29(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    andi a0, a5, 31
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -1729,18 +1731,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    mv a5, a1
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu a6, 31(a0)
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1761,19 +1766,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    lbu ra, 24(a0)
 ; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a3, 27(a0)
+; RV64I-NEXT:    lbu a1, 28(a0)
 ; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
+; RV64I-NEXT:    lbu a5, 0(a5)
+; RV64I-NEXT:    sb a6, 119(sp)
+; RV64I-NEXT:    sb a7, 118(sp)
 ; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 116(sp)
+; RV64I-NEXT:    sb a3, 115(sp)
+; RV64I-NEXT:    sb a4, 114(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -1838,83 +1841,83 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
+; RV64I-NEXT:    andi a5, a5, 31
 ; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    sub a0, a0, a5
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1947,18 +1950,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a5, a1
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1979,19 +1985,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    lbu ra, 24(a0)
 ; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
+; RV32I-NEXT:    lbu a4, 26(a0)
+; RV32I-NEXT:    lbu a3, 27(a0)
+; RV32I-NEXT:    lbu a1, 28(a0)
 ; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
+; RV32I-NEXT:    lbu a5, 0(a5)
+; RV32I-NEXT:    sb a6, 91(sp)
+; RV32I-NEXT:    sb a7, 90(sp)
 ; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
+; RV32I-NEXT:    sb a1, 88(sp)
+; RV32I-NEXT:    sb a3, 87(sp)
+; RV32I-NEXT:    sb a4, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2056,83 +2060,83 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a0, 61(sp)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
+; RV32I-NEXT:    andi a5, a5, 31
 ; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sub a0, a0, a5
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
@@ -2172,86 +2176,87 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    mv t3, a1
+; RV64I-NEXT:    lbu t2, 29(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
 ; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    lbu a1, 2(a0)
 ; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    lbu a1, 3(a0)
 ; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    lbu a1, 4(a0)
 ; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    lbu t4, 8(a0)
-; RV64I-NEXT:    lbu t5, 9(a0)
-; RV64I-NEXT:    lbu t6, 10(a0)
-; RV64I-NEXT:    lbu s0, 11(a0)
-; RV64I-NEXT:    lbu s1, 12(a0)
-; RV64I-NEXT:    lbu s2, 13(a0)
-; RV64I-NEXT:    lbu s3, 14(a0)
-; RV64I-NEXT:    lbu s4, 15(a0)
-; RV64I-NEXT:    lbu s5, 16(a0)
-; RV64I-NEXT:    lbu s6, 17(a0)
-; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 19(a0)
-; RV64I-NEXT:    lbu s9, 20(a0)
-; RV64I-NEXT:    lbu s10, 21(a0)
-; RV64I-NEXT:    lbu s11, 22(a0)
-; RV64I-NEXT:    lbu ra, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu t5, 7(a0)
+; RV64I-NEXT:    lbu t6, 8(a0)
+; RV64I-NEXT:    lbu s0, 9(a0)
+; RV64I-NEXT:    lbu s1, 10(a0)
+; RV64I-NEXT:    lbu s2, 11(a0)
+; RV64I-NEXT:    lbu s3, 12(a0)
+; RV64I-NEXT:    lbu s4, 13(a0)
+; RV64I-NEXT:    lbu s5, 14(a0)
+; RV64I-NEXT:    lbu s6, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu a6, 23(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
+; RV64I-NEXT:    lbu t3, 0(t3)
+; RV64I-NEXT:    sd t3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sb t1, 86(sp)
+; RV64I-NEXT:    sb t2, 85(sp)
 ; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a4, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb ra, 79(sp)
-; RV64I-NEXT:    sb s11, 78(sp)
-; RV64I-NEXT:    sb s10, 77(sp)
-; RV64I-NEXT:    sb s9, 76(sp)
-; RV64I-NEXT:    sb s8, 75(sp)
-; RV64I-NEXT:    sb s7, 74(sp)
-; RV64I-NEXT:    sb s6, 73(sp)
-; RV64I-NEXT:    sb s5, 72(sp)
-; RV64I-NEXT:    sb s4, 71(sp)
-; RV64I-NEXT:    sb s3, 70(sp)
-; RV64I-NEXT:    sb s2, 69(sp)
-; RV64I-NEXT:    sb s1, 68(sp)
-; RV64I-NEXT:    sb s0, 67(sp)
-; RV64I-NEXT:    sb t6, 66(sp)
-; RV64I-NEXT:    sb t5, 65(sp)
-; RV64I-NEXT:    sb t4, 64(sp)
-; RV64I-NEXT:    sb t3, 63(sp)
-; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 83(sp)
+; RV64I-NEXT:    sb a3, 82(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
+; RV64I-NEXT:    sb t0, 87(sp)
+; RV64I-NEXT:    slli t0, t0, 56
+; RV64I-NEXT:    sb a5, 80(sp)
+; RV64I-NEXT:    sb a6, 79(sp)
+; RV64I-NEXT:    sb a7, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s10, 75(sp)
+; RV64I-NEXT:    sb s9, 74(sp)
+; RV64I-NEXT:    sb s8, 73(sp)
+; RV64I-NEXT:    sb s7, 72(sp)
+; RV64I-NEXT:    sb s6, 71(sp)
+; RV64I-NEXT:    sb s5, 70(sp)
+; RV64I-NEXT:    sb s4, 69(sp)
+; RV64I-NEXT:    sb s3, 68(sp)
+; RV64I-NEXT:    sb s2, 67(sp)
+; RV64I-NEXT:    sb s1, 66(sp)
+; RV64I-NEXT:    sb s0, 65(sp)
+; RV64I-NEXT:    sb t6, 64(sp)
+; RV64I-NEXT:    sb t5, 63(sp)
+; RV64I-NEXT:    sb t4, 62(sp)
+; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    srai a0, t0, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2291,83 +2296,84 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
+; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    andi a0, a0, 31
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu t5, 25(a0)
+; RV64I-NEXT:    lbu t4, 26(a0)
+; RV64I-NEXT:    lbu t3, 27(a0)
+; RV64I-NEXT:    lbu t2, 28(a0)
+; RV64I-NEXT:    lbu t1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu a1, 9(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 10(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 11(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 13(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t6, 14(a0)
+; RV64I-NEXT:    lbu s0, 15(a0)
+; RV64I-NEXT:    lbu s1, 16(a0)
+; RV64I-NEXT:    lbu s2, 17(a0)
+; RV64I-NEXT:    lbu s3, 18(a0)
+; RV64I-NEXT:    lbu s4, 19(a0)
+; RV64I-NEXT:    lbu s5, 20(a0)
+; RV64I-NEXT:    lbu s6, 21(a0)
+; RV64I-NEXT:    lbu s7, 22(a0)
+; RV64I-NEXT:    lbu s8, 24(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
+; RV64I-NEXT:    lbu s10, 0(a0)
+; RV64I-NEXT:    lbu s11, 1(a0)
+; RV64I-NEXT:    lbu ra, 2(a0)
+; RV64I-NEXT:    lbu a5, 3(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a1, 6(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    sb s9, 23(a2)
+; RV64I-NEXT:    sb s7, 22(a2)
+; RV64I-NEXT:    sb s6, 21(a2)
+; RV64I-NEXT:    sb s5, 20(a2)
+; RV64I-NEXT:    sb s4, 19(a2)
+; RV64I-NEXT:    sb s3, 18(a2)
+; RV64I-NEXT:    sb s2, 17(a2)
+; RV64I-NEXT:    sb s1, 16(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a7, 30(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t2, 28(a2)
+; RV64I-NEXT:    sb t3, 27(a2)
+; RV64I-NEXT:    sb t4, 26(a2)
+; RV64I-NEXT:    sb t5, 25(a2)
+; RV64I-NEXT:    sb s8, 24(a2)
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb ra, 2(a2)
+; RV64I-NEXT:    sb s11, 1(a2)
+; RV64I-NEXT:    sb s10, 0(a2)
+; RV64I-NEXT:    sb s0, 15(a2)
+; RV64I-NEXT:    sb t6, 14(a2)
 ; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb a0, 13(a2)
 ; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
 ; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
+; RV64I-NEXT:    sb a0, 11(a2)
 ; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a0, 10(a2)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    sb a6, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -2400,86 +2406,87 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
+; RV32I-NEXT:    mv t3, a1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t0, 31(a0)
+; RV32I-NEXT:    lbu t1, 30(a0)
 ; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a1, 1(a0)
 ; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
 ; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a1, 3(a0)
 ; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    lbu a1, 4(a0)
 ; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t3, 7(a0)
-; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s6, 17(a0)
-; RV32I-NEXT:    lbu s7, 18(a0)
-; RV32I-NEXT:    lbu s8, 19(a0)
-; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 21(a0)
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t6, 8(a0)
+; RV32I-NEXT:    lbu s0, 9(a0)
+; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s2, 11(a0)
+; RV32I-NEXT:    lbu s3, 12(a0)
+; RV32I-NEXT:    lbu s4, 13(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s7, 16(a0)
+; RV32I-NEXT:    lbu s8, 17(a0)
+; RV32I-NEXT:    lbu s9, 18(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    lbu ra, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu a6, 23(a0)
+; RV32I-NEXT:    lbu a5, 24(a0)
+; RV32I-NEXT:    lbu a4, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
+; RV32I-NEXT:    lbu t3, 0(t3)
+; RV32I-NEXT:    sw t3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sb t1, 58(sp)
+; RV32I-NEXT:    sb t2, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a4, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s9, 48(sp)
-; RV32I-NEXT:    sb s8, 47(sp)
-; RV32I-NEXT:    sb s7, 46(sp)
-; RV32I-NEXT:    sb s6, 45(sp)
-; RV32I-NEXT:    sb s5, 44(sp)
-; RV32I-NEXT:    sb s4, 43(sp)
-; RV32I-NEXT:    sb s3, 42(sp)
-; RV32I-NEXT:    sb s2, 41(sp)
-; RV32I-NEXT:    sb s1, 40(sp)
-; RV32I-NEXT:    sb s0, 39(sp)
-; RV32I-NEXT:    sb t6, 38(sp)
-; RV32I-NEXT:    sb t5, 37(sp)
-; RV32I-NEXT:    sb t4, 36(sp)
-; RV32I-NEXT:    sb t3, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a1, 55(sp)
+; RV32I-NEXT:    sb a3, 54(sp)
+; RV32I-NEXT:    sb a4, 53(sp)
+; RV32I-NEXT:    sb t0, 59(sp)
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    sb a5, 52(sp)
+; RV32I-NEXT:    sb a6, 51(sp)
+; RV32I-NEXT:    sb a7, 50(sp)
+; RV32I-NEXT:    sb ra, 49(sp)
+; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s10, 47(sp)
+; RV32I-NEXT:    sb s9, 46(sp)
+; RV32I-NEXT:    sb s8, 45(sp)
+; RV32I-NEXT:    sb s7, 44(sp)
+; RV32I-NEXT:    sb s6, 43(sp)
+; RV32I-NEXT:    sb s5, 42(sp)
+; RV32I-NEXT:    sb s4, 41(sp)
+; RV32I-NEXT:    sb s3, 40(sp)
+; RV32I-NEXT:    sb s2, 39(sp)
+; RV32I-NEXT:    sb s1, 38(sp)
+; RV32I-NEXT:    sb s0, 37(sp)
+; RV32I-NEXT:    sb t6, 36(sp)
+; RV32I-NEXT:    sb t5, 35(sp)
+; RV32I-NEXT:    sb t4, 34(sp)
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
+; RV32I-NEXT:    srai a0, t0, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -2515,83 +2522,84 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi a0, a0, 31
 ; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 31(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 30(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu a6, 27(a0)
+; RV32I-NEXT:    lbu a1, 23(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 24(a0)
+; RV32I-NEXT:    lbu t1, 26(a0)
+; RV32I-NEXT:    lbu t2, 25(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu t4, 21(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 19(a0)
+; RV32I-NEXT:    lbu a1, 15(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 16(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 17(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 13(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a1, 7(a0)
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s9, 8(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 9(a0)
+; RV32I-NEXT:    lbu ra, 6(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb s0, 29(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    lw a6, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    sb s3, 17(a2)
+; RV32I-NEXT:    sb s1, 16(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    sb s2, 18(a2)
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    sb t5, 20(a2)
+; RV32I-NEXT:    lw a6, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    sb s8, 22(a2)
+; RV32I-NEXT:    sb s11, 9(a2)
+; RV32I-NEXT:    sb s9, 8(a2)
+; RV32I-NEXT:    sb s7, 11(a2)
+; RV32I-NEXT:    sb s10, 10(a2)
+; RV32I-NEXT:    sb s5, 13(a2)
+; RV32I-NEXT:    sb s6, 12(a2)
+; RV32I-NEXT:    lw a6, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    sb s4, 14(a2)
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb ra, 6(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa6..c80c3e6834f6750 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -8,8 +8,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -37,17 +37,17 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -69,8 +69,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -98,17 +98,17 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -130,8 +130,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
@@ -159,17 +159,17 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -189,47 +189,47 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -262,17 +262,17 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB3_2
@@ -322,47 +322,47 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -395,17 +395,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    or a5, a1, a5
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB4_2
@@ -455,47 +455,47 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
 ; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t1
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
@@ -528,17 +528,17 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a4, a6, 24
 ; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 1(a1)
 ; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    lbu a5, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a5, a1, a6
 ; RV32I-NEXT:    addi a6, a5, -32
 ; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
@@ -589,47 +589,47 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -640,25 +640,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
 ; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu t4, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -710,36 +710,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 39(sp)
 ; RV32I-NEXT:    sb zero, 38(sp)
 ; RV32I-NEXT:    sb zero, 37(sp)
@@ -752,115 +749,120 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    sb zero, 27(sp)
+; RV32I-NEXT:    sb zero, 26(sp)
+; RV32I-NEXT:    sb zero, 25(sp)
+; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    sb s5, 23(sp)
+; RV32I-NEXT:    sb s4, 22(sp)
+; RV32I-NEXT:    sb s3, 21(sp)
+; RV32I-NEXT:    sb s2, 20(sp)
+; RV32I-NEXT:    sb s1, 19(sp)
+; RV32I-NEXT:    sb s0, 18(sp)
+; RV32I-NEXT:    sb t6, 17(sp)
+; RV32I-NEXT:    sb t5, 16(sp)
+; RV32I-NEXT:    sb t4, 15(sp)
+; RV32I-NEXT:    sb t3, 14(sp)
+; RV32I-NEXT:    sb t2, 13(sp)
+; RV32I-NEXT:    sb t1, 12(sp)
+; RV32I-NEXT:    sb t0, 11(sp)
+; RV32I-NEXT:    sb a7, 10(sp)
+; RV32I-NEXT:    sb a6, 9(sp)
+; RV32I-NEXT:    sb a4, 8(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 8
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 4(a1)
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 6(a1)
+; RV32I-NEXT:    lbu a6, 7(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 9(a1)
+; RV32I-NEXT:    lbu t1, 10(a1)
+; RV32I-NEXT:    lbu t2, 11(a1)
+; RV32I-NEXT:    lbu t3, 12(a1)
+; RV32I-NEXT:    lbu t4, 13(a1)
+; RV32I-NEXT:    lbu t5, 14(a1)
+; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    lbu s1, 1(a1)
+; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl a4, a3, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    not a7, a0
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    srl a1, a1, a0
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    sll a3, a3, a7
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    sll a7, t1, a7
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    srl a0, t0, a0
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli t0, a5, 16
+; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a5, 14(a2)
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    sb a5, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    srli a0, a7, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -872,47 +874,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 1(a1)
-; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
 ; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a4
@@ -923,25 +925,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 9(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
 ; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli t4, t4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, t4
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
@@ -993,36 +995,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
+; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a6, 1(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu t0, 3(a0)
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t2, 5(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t5, 8(a0)
+; RV32I-NEXT:    lbu t6, 9(a0)
+; RV32I-NEXT:    lbu s0, 10(a0)
+; RV32I-NEXT:    lbu s1, 11(a0)
 ; RV32I-NEXT:    lbu s2, 12(a0)
 ; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
+; RV32I-NEXT:    or a0, a1, a5
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    sb zero, 23(sp)
 ; RV32I-NEXT:    sb zero, 22(sp)
 ; RV32I-NEXT:    sb zero, 21(sp)
@@ -1035,115 +1034,120 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 14(sp)
 ; RV32I-NEXT:    sb zero, 13(sp)
 ; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
+; RV32I-NEXT:    sb zero, 11(sp)
+; RV32I-NEXT:    sb zero, 10(sp)
+; RV32I-NEXT:    sb zero, 9(sp)
+; RV32I-NEXT:    sb zero, 8(sp)
+; RV32I-NEXT:    sb s5, 39(sp)
+; RV32I-NEXT:    sb s4, 38(sp)
+; RV32I-NEXT:    sb s3, 37(sp)
+; RV32I-NEXT:    sb s2, 36(sp)
+; RV32I-NEXT:    sb s1, 35(sp)
+; RV32I-NEXT:    sb s0, 34(sp)
+; RV32I-NEXT:    sb t6, 33(sp)
+; RV32I-NEXT:    sb t5, 32(sp)
+; RV32I-NEXT:    sb t4, 31(sp)
+; RV32I-NEXT:    sb t3, 30(sp)
+; RV32I-NEXT:    sb t2, 29(sp)
+; RV32I-NEXT:    sb t1, 28(sp)
+; RV32I-NEXT:    sb t0, 27(sp)
+; RV32I-NEXT:    sb a7, 26(sp)
+; RV32I-NEXT:    sb a6, 25(sp)
+; RV32I-NEXT:    sb a4, 24(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    addi a3, sp, 24
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lbu a1, 13(a3)
+; RV32I-NEXT:    lbu a4, 14(a3)
+; RV32I-NEXT:    lbu a5, 15(a3)
+; RV32I-NEXT:    lbu a6, 4(a3)
+; RV32I-NEXT:    lbu a7, 5(a3)
+; RV32I-NEXT:    lbu t0, 6(a3)
+; RV32I-NEXT:    lbu t1, 7(a3)
 ; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    lbu t3, 9(a3)
+; RV32I-NEXT:    lbu t4, 10(a3)
+; RV32I-NEXT:    lbu t5, 12(a3)
+; RV32I-NEXT:    lbu t6, 11(a3)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    lbu s0, 0(a3)
+; RV32I-NEXT:    lbu s1, 1(a3)
+; RV32I-NEXT:    lbu s2, 2(a3)
+; RV32I-NEXT:    lbu a3, 3(a3)
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    sll a7, a6, a0
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
+; RV32I-NEXT:    or a3, a3, s2
+; RV32I-NEXT:    or a3, a3, s0
+; RV32I-NEXT:    srli t0, a3, 1
+; RV32I-NEXT:    xori t1, a0, 31
+; RV32I-NEXT:    srl t0, t0, t1
+; RV32I-NEXT:    or t0, a7, t0
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    sll a1, a1, a0
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or a4, t3, t2
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t6, t4
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    srli a5, a4, 1
 ; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    sll a4, a4, a0
+; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    not t1, a0
+; RV32I-NEXT:    srl a6, a6, t1
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    sll a0, a3, a0
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    sb a3, 10(a2)
 ; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a3, 11(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, a7, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
-; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb t0, 4(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1155,47 +1159,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
 ; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a5, 0(a1)
+; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
 ; RV64I-NEXT:    lbu t0, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t0, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a5, a1, a5
@@ -1208,25 +1212,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
 ; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    lbu t3, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, t3
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
@@ -1277,157 +1281,157 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 0(a1)
+; RV32I-NEXT:    lbu s3, 1(a1)
+; RV32I-NEXT:    lbu s4, 15(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu a0, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    slli s3, s4, 24
+; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    or a0, a0, s2
+; RV32I-NEXT:    sb s4, 23(sp)
+; RV32I-NEXT:    sb s5, 22(sp)
+; RV32I-NEXT:    sb s1, 21(sp)
+; RV32I-NEXT:    sb s0, 20(sp)
+; RV32I-NEXT:    sb t6, 19(sp)
+; RV32I-NEXT:    sb t5, 18(sp)
+; RV32I-NEXT:    sb t4, 17(sp)
+; RV32I-NEXT:    sb t3, 16(sp)
+; RV32I-NEXT:    sb t2, 15(sp)
+; RV32I-NEXT:    sb t1, 14(sp)
+; RV32I-NEXT:    sb t0, 13(sp)
+; RV32I-NEXT:    sb a7, 12(sp)
+; RV32I-NEXT:    sb a6, 11(sp)
+; RV32I-NEXT:    sb a5, 10(sp)
+; RV32I-NEXT:    sb a4, 9(sp)
+; RV32I-NEXT:    sb a3, 8(sp)
+; RV32I-NEXT:    srai a1, s3, 31
+; RV32I-NEXT:    sb a1, 36(sp)
+; RV32I-NEXT:    sb a1, 32(sp)
+; RV32I-NEXT:    sb a1, 28(sp)
+; RV32I-NEXT:    sb a1, 24(sp)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 39(sp)
+; RV32I-NEXT:    srli a4, a1, 16
+; RV32I-NEXT:    sb a4, 38(sp)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 37(sp)
+; RV32I-NEXT:    sb a3, 35(sp)
+; RV32I-NEXT:    sb a4, 34(sp)
+; RV32I-NEXT:    sb a1, 33(sp)
+; RV32I-NEXT:    sb a3, 31(sp)
+; RV32I-NEXT:    sb a4, 30(sp)
+; RV32I-NEXT:    sb a1, 29(sp)
+; RV32I-NEXT:    sb a3, 27(sp)
+; RV32I-NEXT:    sb a4, 26(sp)
+; RV32I-NEXT:    sb a1, 25(sp)
+; RV32I-NEXT:    slli a1, a0, 25
+; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    lbu a3, 4(a1)
+; RV32I-NEXT:    lbu a4, 5(a1)
+; RV32I-NEXT:    lbu a5, 6(a1)
+; RV32I-NEXT:    lbu a6, 7(a1)
+; RV32I-NEXT:    lbu a7, 8(a1)
+; RV32I-NEXT:    lbu t0, 9(a1)
+; RV32I-NEXT:    lbu t1, 10(a1)
+; RV32I-NEXT:    lbu t2, 11(a1)
+; RV32I-NEXT:    lbu t3, 12(a1)
+; RV32I-NEXT:    lbu t4, 13(a1)
+; RV32I-NEXT:    lbu t5, 14(a1)
+; RV32I-NEXT:    lbu t6, 15(a1)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    lbu s1, 1(a1)
+; RV32I-NEXT:    lbu s2, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    andi a0, a0, 7
+; RV32I-NEXT:    srl a4, a3, a0
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a5, t0, a7
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    not a7, a0
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    or a6, a4, a6
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s2
+; RV32I-NEXT:    or a1, a1, s0
+; RV32I-NEXT:    srl a1, a1, a0
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    xori a7, a0, 31
+; RV32I-NEXT:    sll a3, a3, a7
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    sll a7, t1, a7
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    sra a0, t0, a0
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli t0, a5, 16
+; RV32I-NEXT:    sb t0, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a5, 14(a2)
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    sb a5, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    srli a0, a7, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
@@ -1460,18 +1464,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -1480,69 +1509,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
+; RV64I-NEXT:    sb s6, 87(sp)
+; RV64I-NEXT:    sb s4, 86(sp)
+; RV64I-NEXT:    sb s2, 85(sp)
 ; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
+; RV64I-NEXT:    sb a1, 83(sp)
+; RV64I-NEXT:    sb a3, 82(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1575,6 +1562,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 90(sp)
 ; RV64I-NEXT:    sb zero, 89(sp)
 ; RV64I-NEXT:    sb zero, 88(sp)
+; RV64I-NEXT:    sb a4, 81(sp)
+; RV64I-NEXT:    sb a6, 80(sp)
+; RV64I-NEXT:    sb a7, 79(sp)
+; RV64I-NEXT:    sb t0, 78(sp)
+; RV64I-NEXT:    sb ra, 77(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s10, 75(sp)
+; RV64I-NEXT:    sb s9, 74(sp)
+; RV64I-NEXT:    sb s8, 73(sp)
+; RV64I-NEXT:    sb s7, 72(sp)
+; RV64I-NEXT:    sb s5, 71(sp)
+; RV64I-NEXT:    sb s3, 70(sp)
+; RV64I-NEXT:    sb s1, 69(sp)
+; RV64I-NEXT:    sb s0, 68(sp)
+; RV64I-NEXT:    sb t6, 67(sp)
+; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
 ; RV64I-NEXT:    sb t3, 64(sp)
 ; RV64I-NEXT:    sb t2, 63(sp)
 ; RV64I-NEXT:    sb t1, 62(sp)
@@ -1590,111 +1594,112 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 57(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, a5, 56
+; RV64I-NEXT:    mv ra, a5
 ; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    addi a1, sp, 56
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu s1, 21(a0)
+; RV64I-NEXT:    lbu a1, 22(a0)
+; RV64I-NEXT:    lbu s2, 23(a0)
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s4, 25(a0)
+; RV64I-NEXT:    lbu s5, 26(a0)
+; RV64I-NEXT:    lbu s6, 27(a0)
+; RV64I-NEXT:    lbu s7, 28(a0)
+; RV64I-NEXT:    lbu s8, 29(a0)
+; RV64I-NEXT:    lbu s9, 30(a0)
+; RV64I-NEXT:    lbu s10, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    lbu s11, 7(a0)
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    andi a3, ra, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a5, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a6, t6, t5
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    slli a1, a1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    or a1, s2, a1
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    slli a1, a5, 1
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or a0, s11, a0
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or a7, s6, s5
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or a7, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or t0, s10, s9
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
-; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
+; RV64I-NEXT:    srl a4, a6, a3
+; RV64I-NEXT:    srl a5, a5, a3
+; RV64I-NEXT:    srl a3, a7, a3
 ; RV64I-NEXT:    srli a6, a5, 48
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    srli a6, a5, 40
@@ -1709,55 +1714,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    srli a5, a3, 56
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    srli a5, a3, 40
 ; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    srli a5, a3, 32
 ; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    srli a5, a3, 24
 ; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    srli a5, a3, 16
 ; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 25(a2)
-; RV64I-NEXT:    srli a1, a4, 48
-; RV64I-NEXT:    sb a1, 6(a2)
-; RV64I-NEXT:    srli a1, a4, 40
-; RV64I-NEXT:    sb a1, 5(a2)
-; RV64I-NEXT:    srli a1, a4, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    srli a1, a4, 24
-; RV64I-NEXT:    sb a1, 3(a2)
-; RV64I-NEXT:    srli a1, a4, 16
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    or a3, a4, t1
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 7(a2)
 ; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -1790,18 +1795,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -1816,44 +1835,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
+; RV32I-NEXT:    sb a4, 59(sp)
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb ra, 57(sp)
 ; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
+; RV32I-NEXT:    sb a1, 55(sp)
+; RV32I-NEXT:    sb a3, 54(sp)
 ; RV32I-NEXT:    sb zero, 91(sp)
 ; RV32I-NEXT:    sb zero, 90(sp)
 ; RV32I-NEXT:    sb zero, 89(sp)
@@ -1886,6 +1882,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 62(sp)
 ; RV32I-NEXT:    sb zero, 61(sp)
 ; RV32I-NEXT:    sb zero, 60(sp)
+; RV32I-NEXT:    sb a5, 53(sp)
+; RV32I-NEXT:    sb a7, 52(sp)
+; RV32I-NEXT:    sb s11, 51(sp)
+; RV32I-NEXT:    sb s10, 50(sp)
+; RV32I-NEXT:    sb s9, 49(sp)
+; RV32I-NEXT:    sb s8, 48(sp)
+; RV32I-NEXT:    sb s7, 47(sp)
+; RV32I-NEXT:    sb s6, 46(sp)
+; RV32I-NEXT:    sb s5, 45(sp)
+; RV32I-NEXT:    sb s4, 44(sp)
 ; RV32I-NEXT:    sb s3, 43(sp)
 ; RV32I-NEXT:    sb s2, 42(sp)
 ; RV32I-NEXT:    sb s1, 41(sp)
@@ -1896,187 +1902,194 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 36(sp)
 ; RV32I-NEXT:    sb t2, 35(sp)
 ; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu t0, 20(a0)
+; RV32I-NEXT:    lbu s5, 21(a0)
+; RV32I-NEXT:    lbu s6, 22(a0)
+; RV32I-NEXT:    lbu s7, 23(a0)
+; RV32I-NEXT:    lbu s4, 24(a0)
+; RV32I-NEXT:    lbu s8, 25(a0)
+; RV32I-NEXT:    lbu s9, 26(a0)
+; RV32I-NEXT:    lbu s10, 27(a0)
+; RV32I-NEXT:    lbu s11, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, a7, 16
+; RV32I-NEXT:    lw a7, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    lw a7, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t1, 1(a0)
+; RV32I-NEXT:    lbu t2, 0(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    or a7, a7, a1
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a1, t1, t2
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or a3, t6, t5
+; RV32I-NEXT:    or t1, a3, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or a0, s5, t0
+; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t2, a3, 7
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a3, s7, s6
+; RV32I-NEXT:    slli t0, a7, 1
+; RV32I-NEXT:    or t3, a3, a0
+; RV32I-NEXT:    not t4, t2
+; RV32I-NEXT:    sll a0, t0, t4
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or t0, s8, s4
+; RV32I-NEXT:    slli t5, a4, 1
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or t6, s10, s9
+; RV32I-NEXT:    slli a3, s0, 1
+; RV32I-NEXT:    sll a3, a3, t4
+; RV32I-NEXT:    or t6, t6, t0
+; RV32I-NEXT:    slli t0, t6, 1
+; RV32I-NEXT:    sll t4, t0, t4
+; RV32I-NEXT:    xori s1, t2, 31
+; RV32I-NEXT:    sll t0, t5, s1
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
+; RV32I-NEXT:    or a5, a5, s11
+; RV32I-NEXT:    slli t5, t1, 1
+; RV32I-NEXT:    sll t5, t5, s1
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a6, a6, ra
 ; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    sll s2, s2, s1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    sll a6, a6, s1
+; RV32I-NEXT:    srl a4, a4, t2
+; RV32I-NEXT:    srl a1, a1, t2
+; RV32I-NEXT:    srl t1, t1, t2
+; RV32I-NEXT:    srl a7, a7, t2
+; RV32I-NEXT:    srl t3, t3, t2
+; RV32I-NEXT:    srl s0, s0, t2
+; RV32I-NEXT:    srl t6, t6, t2
+; RV32I-NEXT:    srl a5, a5, t2
+; RV32I-NEXT:    srli t2, t6, 16
+; RV32I-NEXT:    sb t2, 26(a2)
+; RV32I-NEXT:    or a6, t6, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    srli t2, t6, 8
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    sb t2, 31(a2)
+; RV32I-NEXT:    srli t2, a5, 16
+; RV32I-NEXT:    sb t2, 30(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a5, s0, 16
+; RV32I-NEXT:    sb a5, 18(a2)
+; RV32I-NEXT:    or a5, s0, s2
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli t2, t3, 16
+; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    or t2, t3, t4
 ; RV32I-NEXT:    sb t3, 20(a2)
 ; RV32I-NEXT:    srli t3, t3, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, a7, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
+; RV32I-NEXT:    or t3, a7, t5
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    srli a7, a7, 8
+; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a7, t1, 16
+; RV32I-NEXT:    sb a7, 14(a2)
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    srli a7, t1, 8
 ; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
+; RV32I-NEXT:    srli a7, a1, 16
 ; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or a7, a1, t0
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    srli a1, t2, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
 ; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
+; RV32I-NEXT:    srli a1, a7, 24
 ; RV32I-NEXT:    sb a1, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
@@ -2118,18 +2131,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu s2, 29(a0)
+; RV64I-NEXT:    lbu s4, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    lbu a3, 0(a1)
+; RV64I-NEXT:    lbu a4, 1(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or s11, a1, a3
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 3(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 4(a0)
+; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 5(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu t2, 7(a0)
 ; RV64I-NEXT:    lbu t3, 8(a0)
@@ -2138,70 +2176,28 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu t6, 11(a0)
 ; RV64I-NEXT:    lbu s0, 12(a0)
 ; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu s3, 14(a0)
+; RV64I-NEXT:    lbu s5, 15(a0)
+; RV64I-NEXT:    lbu s7, 16(a0)
+; RV64I-NEXT:    lbu s8, 17(a0)
+; RV64I-NEXT:    lbu s9, 18(a0)
+; RV64I-NEXT:    lbu s10, 19(a0)
+; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu a6, 23(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a4, 25(a0)
+; RV64I-NEXT:    lbu a3, 26(a0)
+; RV64I-NEXT:    lbu a1, 27(a0)
 ; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
+; RV64I-NEXT:    sb s6, 119(sp)
+; RV64I-NEXT:    sb s4, 118(sp)
+; RV64I-NEXT:    sb s2, 117(sp)
 ; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
+; RV64I-NEXT:    sb a1, 115(sp)
+; RV64I-NEXT:    sb a3, 114(sp)
+; RV64I-NEXT:    sb a4, 113(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -2234,6 +2230,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 58(sp)
 ; RV64I-NEXT:    sb zero, 57(sp)
 ; RV64I-NEXT:    sb zero, 56(sp)
+; RV64I-NEXT:    sb a5, 112(sp)
+; RV64I-NEXT:    sb a6, 111(sp)
+; RV64I-NEXT:    sb a7, 110(sp)
+; RV64I-NEXT:    sb ra, 109(sp)
+; RV64I-NEXT:    sb t0, 108(sp)
+; RV64I-NEXT:    sb s10, 107(sp)
+; RV64I-NEXT:    sb s9, 106(sp)
+; RV64I-NEXT:    sb s8, 105(sp)
+; RV64I-NEXT:    sb s7, 104(sp)
+; RV64I-NEXT:    sb s5, 103(sp)
+; RV64I-NEXT:    sb s3, 102(sp)
+; RV64I-NEXT:    sb s1, 101(sp)
+; RV64I-NEXT:    sb s0, 100(sp)
+; RV64I-NEXT:    sb t6, 99(sp)
+; RV64I-NEXT:    sb t5, 98(sp)
+; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
 ; RV64I-NEXT:    sb t2, 95(sp)
 ; RV64I-NEXT:    sb t1, 94(sp)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
@@ -2248,173 +2261,173 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a0, 89(sp)
 ; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
+; RV64I-NEXT:    slli a0, s11, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 88
 ; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a4, 26(a0)
+; RV64I-NEXT:    lbu a5, 27(a0)
+; RV64I-NEXT:    lbu a6, 28(a0)
+; RV64I-NEXT:    lbu a1, 29(a0)
+; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    lbu t0, 31(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s3, 9(a0)
+; RV64I-NEXT:    lbu s4, 10(a0)
+; RV64I-NEXT:    lbu s5, 11(a0)
+; RV64I-NEXT:    lbu s6, 12(a0)
+; RV64I-NEXT:    lbu s7, 13(a0)
+; RV64I-NEXT:    lbu s8, 14(a0)
+; RV64I-NEXT:    lbu s9, 15(a0)
+; RV64I-NEXT:    lbu t1, 16(a0)
+; RV64I-NEXT:    lbu t2, 17(a0)
+; RV64I-NEXT:    lbu t3, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu t4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s5, s5, 24
+; RV64I-NEXT:    or s3, s5, s4
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    or s3, s7, s6
+; RV64I-NEXT:    slli s8, s8, 16
+; RV64I-NEXT:    slli s9, s9, 24
+; RV64I-NEXT:    or s4, s9, s8
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s3, s3, 32
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 0(a0)
+; RV64I-NEXT:    lbu s4, 1(a0)
+; RV64I-NEXT:    lbu s5, 2(a0)
+; RV64I-NEXT:    lbu s6, 3(a0)
+; RV64I-NEXT:    lbu s7, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    or s4, s8, s7
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or s5, s10, s9
+; RV64I-NEXT:    or s4, s5, s4
+; RV64I-NEXT:    lbu a0, 24(a0)
+; RV64I-NEXT:    slli s4, s4, 32
+; RV64I-NEXT:    or s3, s4, s3
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a5, a5, 24
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    slli a1, a1, 8
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a1, a0
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a0, t2, t1
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    or a1, t5, t3
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:    andi a4, s11, 7
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    or a5, t6, t4
+; RV64I-NEXT:    srli a0, s3, 1
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    xori a6, a4, 63
+; RV64I-NEXT:    srl a0, a0, a6
+; RV64I-NEXT:    or a5, s0, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
-; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
-; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    srli a1, a5, 1
+; RV64I-NEXT:    srl a6, a1, a6
+; RV64I-NEXT:    srli a1, s2, 1
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    srl a7, a1, a7
+; RV64I-NEXT:    sll a1, s2, a4
+; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    sll a5, a5, a4
+; RV64I-NEXT:    sll a4, s3, a4
+; RV64I-NEXT:    srli t0, a5, 56
+; RV64I-NEXT:    sb t0, 23(a2)
+; RV64I-NEXT:    srli t0, a5, 48
+; RV64I-NEXT:    sb t0, 22(a2)
+; RV64I-NEXT:    srli t0, a5, 40
+; RV64I-NEXT:    sb t0, 21(a2)
+; RV64I-NEXT:    srli t0, a5, 32
+; RV64I-NEXT:    sb t0, 20(a2)
+; RV64I-NEXT:    srli t0, a5, 24
+; RV64I-NEXT:    sb t0, 19(a2)
+; RV64I-NEXT:    srli t0, a5, 16
+; RV64I-NEXT:    sb t0, 18(a2)
+; RV64I-NEXT:    or a7, a5, a7
 ; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 1(a2)
-; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
-; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
-; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
-; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    or a5, a3, a6
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, a4, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a7, 16(a2)
+; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -2448,18 +2461,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu ra, 29(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu a4, 31(a0)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a1, a5
+; RV32I-NEXT:    sw a6, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 5(a0)
+; RV32I-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    lbu t3, 8(a0)
@@ -2474,44 +2501,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s5, 17(a0)
 ; RV32I-NEXT:    lbu s6, 18(a0)
 ; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
 ; RV32I-NEXT:    lbu s8, 20(a0)
 ; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu a3, 26(a0)
+; RV32I-NEXT:    lbu a1, 27(a0)
 ; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
+; RV32I-NEXT:    sb a4, 91(sp)
+; RV32I-NEXT:    sb t0, 90(sp)
+; RV32I-NEXT:    sb ra, 89(sp)
 ; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
+; RV32I-NEXT:    sb a1, 87(sp)
+; RV32I-NEXT:    sb a3, 86(sp)
 ; RV32I-NEXT:    sb zero, 59(sp)
 ; RV32I-NEXT:    sb zero, 58(sp)
 ; RV32I-NEXT:    sb zero, 57(sp)
@@ -2544,6 +2548,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb zero, 30(sp)
 ; RV32I-NEXT:    sb zero, 29(sp)
 ; RV32I-NEXT:    sb zero, 28(sp)
+; RV32I-NEXT:    sb a5, 85(sp)
+; RV32I-NEXT:    sb a7, 84(sp)
+; RV32I-NEXT:    sb s11, 83(sp)
+; RV32I-NEXT:    sb s10, 82(sp)
+; RV32I-NEXT:    sb s9, 81(sp)
+; RV32I-NEXT:    sb s8, 80(sp)
+; RV32I-NEXT:    sb s7, 79(sp)
+; RV32I-NEXT:    sb s6, 78(sp)
+; RV32I-NEXT:    sb s5, 77(sp)
+; RV32I-NEXT:    sb s4, 76(sp)
 ; RV32I-NEXT:    sb s3, 75(sp)
 ; RV32I-NEXT:    sb s2, 74(sp)
 ; RV32I-NEXT:    sb s1, 73(sp)
@@ -2554,189 +2568,192 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb t3, 68(sp)
 ; RV32I-NEXT:    sb t2, 67(sp)
 ; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
+; RV32I-NEXT:    slli a0, a6, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    addi a1, sp, 60
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    lbu a3, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    lbu a1, 8(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 9(a0)
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
+; RV32I-NEXT:    lbu a7, 16(a0)
+; RV32I-NEXT:    lbu t6, 17(a0)
+; RV32I-NEXT:    lbu t5, 18(a0)
+; RV32I-NEXT:    lbu s1, 29(a0)
+; RV32I-NEXT:    lbu s0, 30(a0)
+; RV32I-NEXT:    lbu s2, 31(a0)
+; RV32I-NEXT:    lbu s7, 21(a0)
+; RV32I-NEXT:    lbu s8, 22(a0)
+; RV32I-NEXT:    lbu s9, 23(a0)
+; RV32I-NEXT:    lbu s3, 24(a0)
+; RV32I-NEXT:    lbu s5, 25(a0)
+; RV32I-NEXT:    lbu s4, 26(a0)
+; RV32I-NEXT:    lbu s6, 27(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu ra, 3(a0)
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a3, a5, a6
+; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    or a1, ra, a1
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a1, t4, a5
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or a5, t3, t2
+; RV32I-NEXT:    or a6, a5, a1
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    lbu a5, 20(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or t0, s11, t1
+; RV32I-NEXT:    or t0, t0, a1
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or a1, s7, a5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or a5, s9, s8
+; RV32I-NEXT:    or t1, a5, a1
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or a1, t6, a7
+; RV32I-NEXT:    lbu a5, 28(a0)
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi a7, a0, 7
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or a0, s10, t5
+; RV32I-NEXT:    srli t2, a3, 1
+; RV32I-NEXT:    or t3, a0, a1
+; RV32I-NEXT:    xori t4, a7, 31
+; RV32I-NEXT:    srl a0, t2, t4
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or a5, s1, a5
+; RV32I-NEXT:    srli a1, t0, 1
+; RV32I-NEXT:    srl a1, a1, t4
 ; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
-; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
-; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
+; RV32I-NEXT:    or t2, s2, s0
+; RV32I-NEXT:    srli t5, a4, 1
+; RV32I-NEXT:    or t2, t2, a5
+; RV32I-NEXT:    not t6, a7
+; RV32I-NEXT:    srl a5, t5, t6
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t5, s5, s3
+; RV32I-NEXT:    srli s0, t3, 1
+; RV32I-NEXT:    srl s0, s0, t4
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or s1, s6, s4
+; RV32I-NEXT:    srli s2, a6, 1
+; RV32I-NEXT:    srl s2, s2, t6
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    srli s1, t5, 1
+; RV32I-NEXT:    srl t4, s1, t4
+; RV32I-NEXT:    srli s1, t1, 1
+; RV32I-NEXT:    srl t6, s1, t6
+; RV32I-NEXT:    sll a4, a4, a7
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    sll t0, t0, a7
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    sll t2, t2, a7
+; RV32I-NEXT:    sll t5, t5, a7
+; RV32I-NEXT:    sll a3, a3, a7
 ; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
+; RV32I-NEXT:    sb a7, 27(a2)
 ; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
+; RV32I-NEXT:    sb a7, 26(a2)
+; RV32I-NEXT:    or a7, t5, t6
+; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    sb t5, 25(a2)
+; RV32I-NEXT:    srli t5, t2, 24
+; RV32I-NEXT:    sb t5, 31(a2)
+; RV32I-NEXT:    srli t5, t2, 16
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    or t4, t2, t4
+; RV32I-NEXT:    srli t2, t2, 8
+; RV32I-NEXT:    sb t2, 29(a2)
 ; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
+; RV32I-NEXT:    sb t2, 19(a2)
 ; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    sb t2, 18(a2)
 ; RV32I-NEXT:    or t2, t3, s2
 ; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
+; RV32I-NEXT:    sb t3, 17(a2)
 ; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
+; RV32I-NEXT:    sb t3, 23(a2)
 ; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
+; RV32I-NEXT:    sb t3, 22(a2)
+; RV32I-NEXT:    or t3, t1, s0
 ; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 9(a2)
+; RV32I-NEXT:    sb t1, 21(a2)
 ; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    sb t1, 11(a2)
 ; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 14(a2)
+; RV32I-NEXT:    sb t1, 10(a2)
 ; RV32I-NEXT:    or a5, t0, a5
 ; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    srli t0, a6, 24
+; RV32I-NEXT:    sb t0, 15(a2)
+; RV32I-NEXT:    srli t0, a6, 16
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 13(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
 ; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t4, 28(a2)
+; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb t3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
@@ -2776,106 +2793,107 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a3, 28(a0)
 ; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lbu a3, 29(a0)
 ; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
+; RV64I-NEXT:    lbu t6, 31(a0)
+; RV64I-NEXT:    lbu a3, 30(a0)
 ; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t3, 6(a0)
-; RV64I-NEXT:    lbu t4, 7(a0)
-; RV64I-NEXT:    lbu t5, 8(a0)
-; RV64I-NEXT:    lbu t6, 9(a0)
-; RV64I-NEXT:    lbu s0, 10(a0)
-; RV64I-NEXT:    lbu s1, 11(a0)
-; RV64I-NEXT:    lbu s2, 12(a0)
-; RV64I-NEXT:    lbu s3, 13(a0)
-; RV64I-NEXT:    lbu s4, 14(a0)
-; RV64I-NEXT:    lbu s5, 15(a0)
-; RV64I-NEXT:    lbu s6, 16(a0)
-; RV64I-NEXT:    lbu s7, 17(a0)
-; RV64I-NEXT:    lbu s8, 18(a0)
-; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu t0, 1(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    lbu t3, 4(a1)
+; RV64I-NEXT:    lbu t4, 5(a1)
+; RV64I-NEXT:    lbu t5, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    or a1, a1, t5
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 0(a0)
+; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 1(a0)
+; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a1, 2(a0)
+; RV64I-NEXT:    sd a1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t2, 3(a0)
+; RV64I-NEXT:    lbu t3, 4(a0)
+; RV64I-NEXT:    lbu t4, 5(a0)
+; RV64I-NEXT:    lbu t5, 6(a0)
+; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu s0, 8(a0)
+; RV64I-NEXT:    lbu s1, 9(a0)
+; RV64I-NEXT:    lbu s2, 10(a0)
+; RV64I-NEXT:    lbu s3, 11(a0)
+; RV64I-NEXT:    lbu s4, 12(a0)
+; RV64I-NEXT:    lbu s5, 13(a0)
+; RV64I-NEXT:    lbu s6, 14(a0)
+; RV64I-NEXT:    lbu s7, 15(a0)
+; RV64I-NEXT:    lbu s8, 16(a0)
+; RV64I-NEXT:    lbu s9, 17(a0)
+; RV64I-NEXT:    lbu s10, 18(a0)
+; RV64I-NEXT:    lbu s11, 19(a0)
+; RV64I-NEXT:    lbu ra, 20(a0)
+; RV64I-NEXT:    lbu a7, 21(a0)
+; RV64I-NEXT:    lbu a6, 22(a0)
+; RV64I-NEXT:    lbu a5, 23(a0)
+; RV64I-NEXT:    lbu a4, 24(a0)
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a1, 26(a0)
 ; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
+; RV64I-NEXT:    ld t1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 86(sp)
+; RV64I-NEXT:    ld t1, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 85(sp)
+; RV64I-NEXT:    ld t1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb t1, 84(sp)
 ; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s9, 75(sp)
-; RV64I-NEXT:    sb s8, 74(sp)
-; RV64I-NEXT:    sb s7, 73(sp)
-; RV64I-NEXT:    sb s6, 72(sp)
-; RV64I-NEXT:    sb s5, 71(sp)
-; RV64I-NEXT:    sb s4, 70(sp)
-; RV64I-NEXT:    sb s3, 69(sp)
-; RV64I-NEXT:    sb s2, 68(sp)
-; RV64I-NEXT:    sb s1, 67(sp)
-; RV64I-NEXT:    sb s0, 66(sp)
-; RV64I-NEXT:    sb t6, 65(sp)
-; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb t4, 63(sp)
-; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    sb a1, 82(sp)
+; RV64I-NEXT:    sb a3, 81(sp)
+; RV64I-NEXT:    sb t6, 87(sp)
+; RV64I-NEXT:    slli t6, t6, 56
+; RV64I-NEXT:    sb a4, 80(sp)
+; RV64I-NEXT:    sb a5, 79(sp)
+; RV64I-NEXT:    sb a6, 78(sp)
+; RV64I-NEXT:    sb a7, 77(sp)
+; RV64I-NEXT:    sb ra, 76(sp)
+; RV64I-NEXT:    sb s11, 75(sp)
+; RV64I-NEXT:    sb s10, 74(sp)
+; RV64I-NEXT:    sb s9, 73(sp)
+; RV64I-NEXT:    sb s8, 72(sp)
+; RV64I-NEXT:    sb s7, 71(sp)
+; RV64I-NEXT:    sb s6, 70(sp)
+; RV64I-NEXT:    sb s5, 69(sp)
+; RV64I-NEXT:    sb s4, 68(sp)
+; RV64I-NEXT:    sb s3, 67(sp)
+; RV64I-NEXT:    sb s2, 66(sp)
+; RV64I-NEXT:    sb s1, 65(sp)
+; RV64I-NEXT:    sb s0, 64(sp)
+; RV64I-NEXT:    sb t0, 63(sp)
+; RV64I-NEXT:    sb t5, 62(sp)
+; RV64I-NEXT:    sb t4, 61(sp)
+; RV64I-NEXT:    sb t3, 60(sp)
+; RV64I-NEXT:    sb t2, 59(sp)
+; RV64I-NEXT:    ld a0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
+; RV64I-NEXT:    srai a0, t6, 63
 ; RV64I-NEXT:    sb a0, 112(sp)
 ; RV64I-NEXT:    sb a0, 104(sp)
 ; RV64I-NEXT:    sb a0, 96(sp)
@@ -2915,108 +2933,111 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a6, 91(sp)
 ; RV64I-NEXT:    sb a7, 90(sp)
 ; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
+; RV64I-NEXT:    ld a7, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    slli a0, a7, 56
 ; RV64I-NEXT:    srli a0, a0, 59
 ; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    lbu a1, 12(a0)
+; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    lbu t3, 16(a0)
+; RV64I-NEXT:    lbu t4, 17(a0)
+; RV64I-NEXT:    lbu t5, 18(a0)
+; RV64I-NEXT:    lbu t6, 19(a0)
+; RV64I-NEXT:    lbu s0, 20(a0)
+; RV64I-NEXT:    lbu s1, 21(a0)
+; RV64I-NEXT:    lbu s2, 22(a0)
+; RV64I-NEXT:    lbu s3, 23(a0)
+; RV64I-NEXT:    lbu s4, 24(a0)
+; RV64I-NEXT:    lbu s5, 25(a0)
+; RV64I-NEXT:    lbu s6, 26(a0)
+; RV64I-NEXT:    lbu s7, 27(a0)
+; RV64I-NEXT:    lbu s8, 28(a0)
+; RV64I-NEXT:    lbu s9, 29(a0)
+; RV64I-NEXT:    lbu s10, 30(a0)
+; RV64I-NEXT:    lbu s11, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    lbu ra, 7(a0)
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a1, a6, a5
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    ld a3, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    or a3, t0, a3
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a4, t2, t1
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a4, a3, a1
+; RV64I-NEXT:    andi a3, a7, 7
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    or a1, t4, t3
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t6, t5
+; RV64I-NEXT:    or a1, a5, a1
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a5, s3, s2
+; RV64I-NEXT:    or a5, a5, s0
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    slli a1, a5, 1
 ; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    sll a1, a1, a6
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu t2, 4(a0)
+; RV64I-NEXT:    lbu t3, 5(a0)
+; RV64I-NEXT:    lbu a0, 6(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    slli t3, t3, 8
+; RV64I-NEXT:    or a7, t3, t2
+; RV64I-NEXT:    slli a0, a0, 16
+; RV64I-NEXT:    slli ra, ra, 24
+; RV64I-NEXT:    or a0, ra, a0
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a6, a0, a6
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    or a0, s5, s4
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or a7, s7, s6
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or a7, s9, s8
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    or t0, s11, s10
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    xori t0, a3, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a7, a7, a0
+; RV64I-NEXT:    slli a0, a7, 1
+; RV64I-NEXT:    sll t0, a0, t0
+; RV64I-NEXT:    srl a0, a4, a3
 ; RV64I-NEXT:    srl a4, a6, a3
 ; RV64I-NEXT:    srl a5, a5, a3
 ; RV64I-NEXT:    sra a3, a7, a3
@@ -3063,26 +3084,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    srli a4, a0, 48
 ; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    srli a4, a0, 40
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    srli a4, a0, 16
 ; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    or a1, a0, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    sb a0, 23(a2)
 ; RV64I-NEXT:    srli a3, a3, 56
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
+; RV64I-NEXT:    srli a1, a1, 56
+; RV64I-NEXT:    sb a1, 15(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
@@ -3115,94 +3136,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a4, 1(a1)
+; RV32I-NEXT:    lbu a7, 31(a0)
+; RV32I-NEXT:    lbu t0, 30(a0)
+; RV32I-NEXT:    lbu t1, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 1(a0)
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 2(a0)
+; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 3(a0)
+; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t6, 8(a0)
+; RV32I-NEXT:    lbu s0, 9(a0)
+; RV32I-NEXT:    lbu s1, 10(a0)
+; RV32I-NEXT:    lbu s2, 11(a0)
+; RV32I-NEXT:    lbu s3, 12(a0)
+; RV32I-NEXT:    lbu s4, 13(a0)
+; RV32I-NEXT:    lbu s5, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s7, 16(a0)
+; RV32I-NEXT:    lbu s8, 17(a0)
+; RV32I-NEXT:    lbu s9, 18(a0)
+; RV32I-NEXT:    lbu s10, 19(a0)
+; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    lbu ra, 21(a0)
+; RV32I-NEXT:    lbu a6, 22(a0)
+; RV32I-NEXT:    lbu a5, 23(a0)
+; RV32I-NEXT:    lbu a4, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu a1, 26(a0)
 ; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
+; RV32I-NEXT:    sb t0, 58(sp)
+; RV32I-NEXT:    sb t2, 57(sp)
+; RV32I-NEXT:    sb t3, 56(sp)
 ; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
+; RV32I-NEXT:    sb a1, 54(sp)
+; RV32I-NEXT:    sb a3, 53(sp)
+; RV32I-NEXT:    sb a7, 59(sp)
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    sb a4, 52(sp)
+; RV32I-NEXT:    sb a5, 51(sp)
+; RV32I-NEXT:    sb a6, 50(sp)
+; RV32I-NEXT:    sb ra, 49(sp)
+; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s10, 47(sp)
+; RV32I-NEXT:    sb s9, 46(sp)
+; RV32I-NEXT:    sb s8, 45(sp)
+; RV32I-NEXT:    sb s7, 44(sp)
+; RV32I-NEXT:    sb s6, 43(sp)
+; RV32I-NEXT:    sb s5, 42(sp)
+; RV32I-NEXT:    sb s4, 41(sp)
+; RV32I-NEXT:    sb s3, 40(sp)
+; RV32I-NEXT:    sb s2, 39(sp)
+; RV32I-NEXT:    sb s1, 38(sp)
+; RV32I-NEXT:    sb s0, 37(sp)
+; RV32I-NEXT:    sb t6, 36(sp)
+; RV32I-NEXT:    sb t5, 35(sp)
+; RV32I-NEXT:    sb t4, 34(sp)
+; RV32I-NEXT:    sb t1, 33(sp)
 ; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
+; RV32I-NEXT:    srai a0, a7, 31
 ; RV32I-NEXT:    sb a0, 88(sp)
 ; RV32I-NEXT:    sb a0, 84(sp)
 ; RV32I-NEXT:    sb a0, 80(sp)
@@ -3238,176 +3258,184 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a1, 63(sp)
 ; RV32I-NEXT:    sb a3, 62(sp)
 ; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
+; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    addi a1, sp, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a1, 4(a0)
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu t0, 6(a0)
+; RV32I-NEXT:    lbu a3, 7(a0)
+; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 8(a0)
+; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t1, 10(a0)
+; RV32I-NEXT:    lbu t2, 11(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu t4, 13(a0)
+; RV32I-NEXT:    lbu t5, 14(a0)
+; RV32I-NEXT:    lbu t6, 15(a0)
+; RV32I-NEXT:    lbu s0, 16(a0)
+; RV32I-NEXT:    lbu s1, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu s4, 20(a0)
+; RV32I-NEXT:    lbu s6, 21(a0)
+; RV32I-NEXT:    lbu s7, 22(a0)
+; RV32I-NEXT:    lbu s8, 23(a0)
+; RV32I-NEXT:    lbu s5, 24(a0)
+; RV32I-NEXT:    lbu s9, 25(a0)
+; RV32I-NEXT:    lbu s10, 26(a0)
+; RV32I-NEXT:    lbu s11, 27(a0)
+; RV32I-NEXT:    lbu ra, 28(a0)
+; RV32I-NEXT:    lbu a5, 29(a0)
+; RV32I-NEXT:    lbu a7, 30(a0)
+; RV32I-NEXT:    lbu a6, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    or a4, a4, a1
+; RV32I-NEXT:    slli a1, t0, 16
+; RV32I-NEXT:    lw t0, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    or a1, t0, a1
+; RV32I-NEXT:    or a4, a1, a4
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    lw t0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    lbu t1, 1(a0)
+; RV32I-NEXT:    lbu t2, 0(a0)
+; RV32I-NEXT:    lbu a0, 2(a0)
+; RV32I-NEXT:    or t0, t0, a1
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a1, t1, t2
+; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or a0, t4, t3
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or a3, t6, t5
+; RV32I-NEXT:    or t2, a3, a0
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or a0, s3, s2
+; RV32I-NEXT:    or s0, a0, s0
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or a0, s6, s4
+; RV32I-NEXT:    lw a3, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    andi t3, a3, 7
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    or a3, s8, s7
+; RV32I-NEXT:    slli t1, t0, 1
+; RV32I-NEXT:    or t4, a3, a0
+; RV32I-NEXT:    not a3, t3
+; RV32I-NEXT:    sll a0, t1, a3
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    or t5, s9, s5
+; RV32I-NEXT:    slli t6, a4, 1
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s1, s11, s10
+; RV32I-NEXT:    slli t1, s0, 1
+; RV32I-NEXT:    sll t1, t1, a3
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    slli s1, t5, 1
+; RV32I-NEXT:    sll s1, s1, a3
+; RV32I-NEXT:    xori s2, t3, 31
+; RV32I-NEXT:    sll a3, t6, s2
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
+; RV32I-NEXT:    or a5, a5, ra
+; RV32I-NEXT:    slli t6, t2, 1
+; RV32I-NEXT:    sll t6, t6, s2
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a7, t4, 1
+; RV32I-NEXT:    sll a7, a7, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    slli a6, a5, 1
+; RV32I-NEXT:    sll a6, a6, s2
+; RV32I-NEXT:    srl a4, a4, t3
+; RV32I-NEXT:    srl a1, a1, t3
+; RV32I-NEXT:    srl t2, t2, t3
+; RV32I-NEXT:    srl t0, t0, t3
+; RV32I-NEXT:    srl t4, t4, t3
+; RV32I-NEXT:    srl s0, s0, t3
+; RV32I-NEXT:    srl t5, t5, t3
+; RV32I-NEXT:    sra a5, a5, t3
+; RV32I-NEXT:    srli t3, t5, 16
+; RV32I-NEXT:    sb t3, 26(a2)
+; RV32I-NEXT:    or a6, t5, a6
 ; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
+; RV32I-NEXT:    srli t3, t5, 8
+; RV32I-NEXT:    sb t3, 25(a2)
+; RV32I-NEXT:    srli t3, a5, 24
+; RV32I-NEXT:    sb t3, 31(a2)
+; RV32I-NEXT:    srli t3, a5, 16
+; RV32I-NEXT:    sb t3, 30(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a5, s0, 16
+; RV32I-NEXT:    sb a5, 18(a2)
+; RV32I-NEXT:    or a5, s0, a7
+; RV32I-NEXT:    sb s0, 16(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 17(a2)
+; RV32I-NEXT:    srli a7, t4, 16
+; RV32I-NEXT:    sb a7, 22(a2)
+; RV32I-NEXT:    or a7, t4, s1
+; RV32I-NEXT:    sb t4, 20(a2)
+; RV32I-NEXT:    srli t3, t4, 8
 ; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
+; RV32I-NEXT:    srli t3, t0, 16
 ; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    or t3, t0, t6
+; RV32I-NEXT:    sb t0, 8(a2)
+; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    sb t0, 9(a2)
+; RV32I-NEXT:    srli t0, t2, 16
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    sb t2, 12(a2)
+; RV32I-NEXT:    srli t1, t2, 8
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    srli t1, a1, 16
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    or a3, a1, a3
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a1, a6, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    srli a1, a7, 24
+; RV32I-NEXT:    sb a1, 23(a2)
+; RV32I-NEXT:    srli a1, t3, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a1, t0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    sb a3, 3(a2)
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
index 34900b300691596..95cb61ef0505193 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll
@@ -57,14 +57,14 @@ define i64 @lwud(i32* %a) {
 define i64 @ldd(i64* %a) {
 ; RV32XTHEADMEMPAIR-LABEL: ldd:
 ; RV32XTHEADMEMPAIR:       # %bb.0:
-; RV32XTHEADMEMPAIR-NEXT:    lw a1, 32(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a2, 36(a0)
-; RV32XTHEADMEMPAIR-NEXT:    lw a3, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a1, 44(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a2, 32(a0)
+; RV32XTHEADMEMPAIR-NEXT:    lw a3, 36(a0)
 ; RV32XTHEADMEMPAIR-NEXT:    lw a0, 40(a0)
-; RV32XTHEADMEMPAIR-NEXT:    add a2, a2, a3
-; RV32XTHEADMEMPAIR-NEXT:    add a0, a1, a0
-; RV32XTHEADMEMPAIR-NEXT:    sltu a1, a0, a1
-; RV32XTHEADMEMPAIR-NEXT:    add a1, a2, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a3, a1
+; RV32XTHEADMEMPAIR-NEXT:    add a0, a2, a0
+; RV32XTHEADMEMPAIR-NEXT:    sltu a2, a0, a2
+; RV32XTHEADMEMPAIR-NEXT:    add a1, a1, a2
 ; RV32XTHEADMEMPAIR-NEXT:    ret
 ;
 ; RV64XTHEADMEMPAIR-LABEL: ldd: