[llvm] [Codegen][LegalizeIntegerTypes] Improve shift through stack (PR #96151)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 20 02:07:41 PDT 2024


https://github.com/futog created https://github.com/llvm/llvm-project/pull/96151

Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.

If the target does not support unaligned memory access, use native register aligment instead of byte alignment. The shift amount is also splitted based on the native aligment, so load happens only from aligned addresses.

>From 0b8dea8d16aaeb8c423adf965af5f605cadd201e Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Thu, 20 Jun 2024 10:08:16 +0200
Subject: [PATCH] [Codegen][LegalizeIntegerTypes] Improve shift through stack

Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.

If the target does not support unaligned memory access, use native
register aligment instead of byte alignment. The shift amount is also
splitted based on the native aligment, so load happens only from aligned
addresses.
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |   58 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  366 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 3119 +++++++-------
 .../RISCV/wide-scalar-shift-legalization.ll   | 3581 +++++++----------
 4 files changed, 3029 insertions(+), 4095 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a058b509b3aca..f21ed7581a5af 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4530,14 +4530,25 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   SDValue ShAmt = N->getOperand(1);
   EVT ShAmtVT = ShAmt.getValueType();
 
-  // This legalization is optimal when the shift is by a multiple of byte width,
-  //   %x * 8 <-> %x << 3   so 3 low bits should be be known zero.
-  bool ShiftByByteMultiple =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+  EVT LoadStoreVT = VT;
+  do {
+      LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+  }while (!TLI.isTypeLegal(LoadStoreVT));
+
+  const Align LoadStoreAlign = [&]() -> Align {
+      if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
+          return Align(1);
+
+      return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+  }();
+
+  const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
+  const bool IsOneStepShift =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.
-  if (!ShiftByByteMultiple)
+  if (!IsOneStepShift)
     ShAmt = DAG.getFreeze(ShAmt);
 
   unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4551,8 +4562,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 
   // Get a temporary stack slot 2x the width of our VT.
   // FIXME: reuse stack slots?
-  // FIXME: should we be more picky about alignment?
-  Align StackSlotAlignment(1);
+  Align StackSlotAlignment(LoadStoreAlign);
   SDValue StackPtr = DAG.CreateStackTemporary(
       TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
   EVT PtrTy = StackPtr.getValueType();
@@ -4577,16 +4587,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
 
   // Now, compute the full-byte offset into stack slot from where we can load.
-  // We have shift amount, which is in bits, but in multiples of byte.
-  // So just divide by CHAR_BIT.
+  // We have shift amount, which is in bits. Offset should point to an aligned
+  // address.
   SDNodeFlags Flags;
-  if (ShiftByByteMultiple)
+  if (IsOneStepShift)
     Flags.setExact(true);
-  SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(3, dl, ShAmtVT), Flags);
+  SDValue OffsetInBits = DAG.getNode(ISD::SHL, dl, ShAmtVT,
+                                     DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt, DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags),
+                                     DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+  Flags.setExact(true);
+  SDValue Offset = DAG.getNode(
+      ISD::SRL, dl, ShAmtVT,
+      OffsetInBits,
+      DAG.getConstant(3, dl, ShAmtVT), Flags);
   // And clamp it, because OOB load is an immediate UB,
   // while shift overflow would have *just* been poison.
-  ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
+  Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
                            DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
   // We have exactly two strategies on indexing into stack slot here:
   // 1. upwards starting from the beginning of the slot
@@ -4603,23 +4619,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   } else {
     AdjStackPtr = DAG.getMemBasePlusOffset(
         StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
-    ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+    Offset = DAG.getNegative(Offset, dl, ShAmtVT);
   }
 
   // Get the pointer somewhere into the stack slot from which we need to load.
-  ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
-  AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+  Offset = DAG.getSExtOrTrunc(Offset, dl, PtrTy);
+  AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
 
   // And load it! While the load is not legal, legalizing it is obvious.
   SDValue Res = DAG.getLoad(
       VT, dl, Ch, AdjStackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
-  // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), LoadStoreAlign);
+  // We've performed the shift by a CHAR_BIT * [ShAmt / LoadAlign]
 
-  // If we may still have a less-than-CHAR_BIT to shift by, do so now.
-  if (!ShiftByByteMultiple) {
+  // If we may still have a remaining bits to shift by, do so now.
+  if (!IsOneStepShift) {
     SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(7, dl, ShAmtVT));
+                                   DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
     Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
   }
 
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed725..5ba8755201ddf 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
-; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
-; RV32I-NEXT:    srli a3, a5, 24
-; RV32I-NEXT:    sb a3, 7(sp)
-; RV32I-NEXT:    srli a3, a5, 16
-; RV32I-NEXT:    sb a3, 6(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw a3, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lw a1, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a1, a2
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    andi a7, a2, 31
+; RV32I-NEXT:    lw t0, 8(a3)
+; RV32I-NEXT:    xori a7, a7, 31
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sll a6, t0, a2
+; RV32I-NEXT:    lw a3, 12(a3)
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli a6, t0, 1
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    sll a2, a4, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
-; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..0b87bb05cfd63 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb s2, 18(sp)
-; RV32I-NEXT:    sb s1, 17(sp)
-; RV32I-NEXT:    sb s0, 16(sp)
-; RV32I-NEXT:    sb t6, 15(sp)
-; RV32I-NEXT:    sb t5, 14(sp)
-; RV32I-NEXT:    sb t4, 13(sp)
-; RV32I-NEXT:    sb t3, 12(sp)
-; RV32I-NEXT:    sb t2, 11(sp)
-; RV32I-NEXT:    sb t1, 10(sp)
-; RV32I-NEXT:    sb t0, 9(sp)
-; RV32I-NEXT:    sb a7, 8(sp)
-; RV32I-NEXT:    sb a6, 7(sp)
-; RV32I-NEXT:    sb a5, 6(sp)
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    sb a3, 4(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 4
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -942,98 +961,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb s2, 34(sp)
-; RV32I-NEXT:    sb s1, 33(sp)
-; RV32I-NEXT:    sb s0, 32(sp)
-; RV32I-NEXT:    sb t6, 31(sp)
-; RV32I-NEXT:    sb t5, 30(sp)
-; RV32I-NEXT:    sb t4, 29(sp)
-; RV32I-NEXT:    sb t3, 28(sp)
-; RV32I-NEXT:    sb t2, 27(sp)
-; RV32I-NEXT:    sb t1, 26(sp)
-; RV32I-NEXT:    sb t0, 25(sp)
-; RV32I-NEXT:    sb a7, 24(sp)
-; RV32I-NEXT:    sb a6, 23(sp)
-; RV32I-NEXT:    sb a5, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a3, 20(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 20
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    srli t1, a3, 1
+; RV32I-NEXT:    srl t1, t1, a6
+; RV32I-NEXT:    or t1, t0, t1
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, t0, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 7(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    srli a0, t1, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, t1, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1161,105 +1199,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1272,441 +1323,429 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
-; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t0, 81(sp)
-; RV64I-NEXT:    sb ra, 80(sp)
-; RV64I-NEXT:    sb s11, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
 ; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
-; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb t0, 53(sp)
-; RV32I-NEXT:    sb ra, 52(sp)
-; RV32I-NEXT:    sb s11, 51(sp)
-; RV32I-NEXT:    sb s10, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
-; RV32I-NEXT:    ret
-  %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
   %res = lshr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
@@ -1715,438 +1754,426 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
-; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t0, 113(sp)
-; RV64I-NEXT:    sb ra, 112(sp)
-; RV64I-NEXT:    sb s11, 111(sp)
-; RV64I-NEXT:    sb s10, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a6, a4, a1
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    srli a0, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a0, a0, a7
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    sll t0, t0, a1
+; RV64I-NEXT:    srli t1, a3, 1
+; RV64I-NEXT:    srl t1, t1, a7
+; RV64I-NEXT:    or t1, t0, t1
+; RV64I-NEXT:    sll a3, a3, a1
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a7
+; RV64I-NEXT:    or a4, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a3, t0, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    srli a1, a6, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb t1, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a4, 48
 ; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a1, t1, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, t1, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, t1, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, t1, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, t1, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, t1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb t0, 85(sp)
-; RV32I-NEXT:    sb ra, 84(sp)
-; RV32I-NEXT:    sb s11, 83(sp)
-; RV32I-NEXT:    sb s10, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a3, 36(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 36
+; RV32I-NEXT:    sub a6, a3, a0
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    slli a7, a1, 3
+; RV32I-NEXT:    lw t0, 0(a6)
+; RV32I-NEXT:    sll a4, a3, a7
+; RV32I-NEXT:    andi a0, a7, 24
+; RV32I-NEXT:    xori t1, a0, 31
+; RV32I-NEXT:    srli a0, t0, 1
+; RV32I-NEXT:    lw t2, 12(a6)
+; RV32I-NEXT:    lw a5, 8(a6)
+; RV32I-NEXT:    srl a0, a0, t1
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sll t3, t2, a7
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    srl a1, a1, t1
+; RV32I-NEXT:    or a1, t3, a1
+; RV32I-NEXT:    sll t4, a5, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t5, 20(a6)
+; RV32I-NEXT:    lw t6, 16(a6)
+; RV32I-NEXT:    srl a3, a3, t1
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    sll s0, t5, a7
+; RV32I-NEXT:    srli a5, t6, 1
+; RV32I-NEXT:    srl a5, a5, t1
+; RV32I-NEXT:    or a5, s0, a5
+; RV32I-NEXT:    sll t6, t6, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    lw s1, 28(a6)
+; RV32I-NEXT:    lw a6, 24(a6)
+; RV32I-NEXT:    srl t2, t2, t1
+; RV32I-NEXT:    or t2, t6, t2
+; RV32I-NEXT:    sll s1, s1, a7
+; RV32I-NEXT:    srli s2, a6, 1
+; RV32I-NEXT:    srl s2, s2, t1
+; RV32I-NEXT:    or s2, s1, s2
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    srli t5, t5, 1
+; RV32I-NEXT:    srl t1, t5, t1
+; RV32I-NEXT:    or t1, a6, t1
+; RV32I-NEXT:    sll a7, t0, a7
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    srli a6, a6, 24
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    srli s1, s1, 24
+; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    srli a6, t6, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli s0, s0, 24
+; RV32I-NEXT:    sb s0, 23(a2)
+; RV32I-NEXT:    srli a6, t4, 24
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    srli a6, t3, 24
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    sb t1, 24(a2)
+; RV32I-NEXT:    sb s2, 28(a2)
+; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, s2, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a4, s2, 8
 ; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2158,454 +2185,428 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
-; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    lbu t4, 8(a0)
-; RV64I-NEXT:    lbu t5, 9(a0)
-; RV64I-NEXT:    lbu t6, 10(a0)
-; RV64I-NEXT:    lbu s0, 11(a0)
-; RV64I-NEXT:    lbu s1, 12(a0)
-; RV64I-NEXT:    lbu s2, 13(a0)
-; RV64I-NEXT:    lbu s3, 14(a0)
-; RV64I-NEXT:    lbu s4, 15(a0)
-; RV64I-NEXT:    lbu s5, 16(a0)
-; RV64I-NEXT:    lbu s6, 17(a0)
-; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 19(a0)
-; RV64I-NEXT:    lbu s9, 20(a0)
-; RV64I-NEXT:    lbu s10, 21(a0)
-; RV64I-NEXT:    lbu s11, 22(a0)
-; RV64I-NEXT:    lbu ra, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a4, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb ra, 79(sp)
-; RV64I-NEXT:    sb s11, 78(sp)
-; RV64I-NEXT:    sb s10, 77(sp)
-; RV64I-NEXT:    sb s9, 76(sp)
-; RV64I-NEXT:    sb s8, 75(sp)
-; RV64I-NEXT:    sb s7, 74(sp)
-; RV64I-NEXT:    sb s6, 73(sp)
-; RV64I-NEXT:    sb s5, 72(sp)
-; RV64I-NEXT:    sb s4, 71(sp)
-; RV64I-NEXT:    sb s3, 70(sp)
-; RV64I-NEXT:    sb s2, 69(sp)
-; RV64I-NEXT:    sb s1, 68(sp)
-; RV64I-NEXT:    sb s0, 67(sp)
-; RV64I-NEXT:    sb t6, 66(sp)
-; RV64I-NEXT:    sb t5, 65(sp)
-; RV64I-NEXT:    sb t4, 64(sp)
-; RV64I-NEXT:    sb t3, 63(sp)
-; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
 ; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
 ; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t3, 7(a0)
-; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s6, 17(a0)
-; RV32I-NEXT:    lbu s7, 18(a0)
-; RV32I-NEXT:    lbu s8, 19(a0)
-; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 21(a0)
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a4, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s9, 48(sp)
-; RV32I-NEXT:    sb s8, 47(sp)
-; RV32I-NEXT:    sb s7, 46(sp)
-; RV32I-NEXT:    sb s6, 45(sp)
-; RV32I-NEXT:    sb s5, 44(sp)
-; RV32I-NEXT:    sb s4, 43(sp)
-; RV32I-NEXT:    sb s3, 42(sp)
-; RV32I-NEXT:    sb s2, 41(sp)
-; RV32I-NEXT:    sb s1, 40(sp)
-; RV32I-NEXT:    sb s0, 39(sp)
-; RV32I-NEXT:    sb t6, 38(sp)
-; RV32I-NEXT:    sb t5, 37(sp)
-; RV32I-NEXT:    sb t4, 36(sp)
-; RV32I-NEXT:    sb t3, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
-; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a1, a1, t3
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    sra a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
 ; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
 ; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afa..7e879b137b4f0 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
-; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    sll a7, t0, a1
+; RV32I-NEXT:    srli t0, a3, 1
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
 ; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a1, 24
 ; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a5, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
-; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
-; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
 ; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    srl a1, a3, a1
 ; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
 ; RV64I-NEXT:    srli a1, a4, 48
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 3(a2)
 ; RV64I-NEXT:    srli a1, a4, 16
 ; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
-; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    srl a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 88
-; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a0, a4, a1
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    srli a7, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a7, a7, a6
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    sll a7, t0, a1
 ; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
+; RV64I-NEXT:    srl t0, t0, a6
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
 ; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a4, a1, 56
+; RV64I-NEXT:    sb a4, 7(a2)
+; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    sb a4, 6(a2)
+; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    sb a4, 5(a2)
+; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    sb a4, 3(a2)
+; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    sb a4, 2(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb a3, 16(a2)
+; RV64I-NEXT:    sb a7, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a1, 23(a2)
 ; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
 ; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    sb a1, 21(a2)
 ; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    sb a1, 20(a2)
 ; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 19(a2)
 ; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    sb a3, 17(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t2, 56(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    sw t0, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    addi a1, sp, 32
+; RV32I-NEXT:    sub a4, a1, a0
+; RV32I-NEXT:    lw a3, 4(a4)
+; RV32I-NEXT:    lw a5, 0(a4)
+; RV32I-NEXT:    sll a0, a3, a7
+; RV32I-NEXT:    andi a1, a7, 31
+; RV32I-NEXT:    xori a6, a1, 31
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 8(a4)
+; RV32I-NEXT:    srl a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a7
+; RV32I-NEXT:    srli t2, t1, 1
+; RV32I-NEXT:    srl t2, t2, a6
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 16(a4)
+; RV32I-NEXT:    srl a3, a3, a6
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    sll t1, t2, a7
+; RV32I-NEXT:    srli t4, t3, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    srli t0, t0, 1
+; RV32I-NEXT:    lw t4, 28(a4)
+; RV32I-NEXT:    lw a4, 24(a4)
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    sll t3, t4, a7
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    sll a4, a4, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    srl a6, t2, a6
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    sll a5, a5, a7
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    srli a6, a5, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a5, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb a4, 24(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb t1, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    sb a5, 27(a2)
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    sb a5, 26(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
+; RV32I-NEXT:    srli a4, t3, 24
 ; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
+; RV32I-NEXT:    srli a4, t3, 16
 ; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
-; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
-; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
-; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
-; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    or t2, t3, s2
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
-; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
-; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 9(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 14(a2)
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a4, t3, 8
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 24
 ; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t3, 6(a0)
-; RV64I-NEXT:    lbu t4, 7(a0)
-; RV64I-NEXT:    lbu t5, 8(a0)
-; RV64I-NEXT:    lbu t6, 9(a0)
-; RV64I-NEXT:    lbu s0, 10(a0)
-; RV64I-NEXT:    lbu s1, 11(a0)
-; RV64I-NEXT:    lbu s2, 12(a0)
-; RV64I-NEXT:    lbu s3, 13(a0)
-; RV64I-NEXT:    lbu s4, 14(a0)
-; RV64I-NEXT:    lbu s5, 15(a0)
-; RV64I-NEXT:    lbu s6, 16(a0)
-; RV64I-NEXT:    lbu s7, 17(a0)
-; RV64I-NEXT:    lbu s8, 18(a0)
-; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
-; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
-; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s9, 75(sp)
-; RV64I-NEXT:    sb s8, 74(sp)
-; RV64I-NEXT:    sb s7, 73(sp)
-; RV64I-NEXT:    sb s6, 72(sp)
-; RV64I-NEXT:    sb s5, 71(sp)
-; RV64I-NEXT:    sb s4, 70(sp)
-; RV64I-NEXT:    sb s3, 69(sp)
-; RV64I-NEXT:    sb s2, 68(sp)
-; RV64I-NEXT:    sb s1, 67(sp)
-; RV64I-NEXT:    sb s0, 66(sp)
-; RV64I-NEXT:    sb t6, 65(sp)
-; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb t4, 63(sp)
-; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or t0, t0, t1
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
-; RV64I-NEXT:    slli t1, a4, 1
 ; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    xori t0, a3, 63
-; RV64I-NEXT:    sll t1, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
-; RV64I-NEXT:    srl a4, a6, a3
-; RV64I-NEXT:    srl a5, a5, a3
-; RV64I-NEXT:    sra a3, a7, a3
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
 ; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a3, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a3, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a3, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a3, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 25(a2)
-; RV64I-NEXT:    srli a3, a4, 48
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    srli a3, a4, 40
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    srli a3, a4, 32
-; RV64I-NEXT:    sb a3, 4(a2)
-; RV64I-NEXT:    srli a3, a4, 24
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a3, a4, 16
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    or a3, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
-; RV64I-NEXT:    srli a3, a3, 56
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
-; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t3, t3, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t3, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    sra a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1



More information about the llvm-commits mailing list