[llvm] [Codegen][LegalizeIntegerTypes] Improve shift through stack (PR #96151)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 3 03:51:35 PDT 2024


https://github.com/futog updated https://github.com/llvm/llvm-project/pull/96151

>From 0b8dea8d16aaeb8c423adf965af5f605cadd201e Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Thu, 20 Jun 2024 10:08:16 +0200
Subject: [PATCH 1/4] [Codegen][LegalizeIntegerTypes] Improve shift through
 stack

Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.

If the target does not support unaligned memory access, use the native
register alignment instead of byte alignment. The shift amount is also
split based on the native alignment, so loads happen only from aligned
addresses.
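
For context, this expansion lowers a wide shift by spilling the value to
a 2x-wide stack slot and reloading from an offset derived from the shift
amount. A minimal C++ sketch of the lshr case, assuming a 32-bit target
with 4-byte native loads (the names WordBits and lshr128ViaStack are
illustrative, not part of the patch):

  #include <cstdint>
  #include <cstring>

  // Shift unit the patch derives from the native load/store alignment.
  constexpr unsigned WordBits = 32;

  // Logical right shift of a 128-bit value held in four 32-bit words,
  // mimicking the stack-slot expansion with word-aligned accesses only.
  void lshr128ViaStack(uint32_t Dst[4], const uint32_t Src[4], unsigned Amt) {
    // 2x-wide slot: value in the low half, zero padding in the high half.
    uint32_t Slot[8] = {};
    std::memcpy(Slot, Src, 4 * sizeof(uint32_t));

    // Index by whole words, so every access below is 4-byte aligned.
    unsigned WordOff = (Amt / WordBits) & 3; // clamp: an OOB load would be UB
    unsigned Rem = Amt % WordBits;           // leftover sub-word shift

    for (unsigned I = 0; I != 4; ++I) {
      uint32_t Lo = Slot[WordOff + I];
      uint32_t Hi = Slot[WordOff + I + 1];
      // Funnel in the next word; guard Rem == 0, since shifting a 32-bit
      // value by 32 is undefined behavior.
      Dst[I] = Rem ? (Lo >> Rem) | (Hi << (WordBits - Rem)) : Lo;
    }
  }

When the shift amount is known to be a multiple of WordBits, Rem is zero
and the aligned reload alone performs the shift; that is the one-step
case the patch tracks with IsOneStepShift.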
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |   58 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  366 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 3119 +++++++-------
 .../RISCV/wide-scalar-shift-legalization.ll   | 3581 +++++++----------
 4 files changed, 3029 insertions(+), 4095 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a058b509b3aca..f21ed7581a5af 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4530,14 +4530,25 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   SDValue ShAmt = N->getOperand(1);
   EVT ShAmtVT = ShAmt.getValueType();
 
-  // This legalization is optimal when the shift is by a multiple of byte width,
-  //   %x * 8 <-> %x << 3   so 3 low bits should be be known zero.
-  bool ShiftByByteMultiple =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
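+  // Find a legal type for the stack accesses by repeatedly transforming VT
+  // until the target considers the result legal (e.g. i128 -> i64 -> i32 on
+  // RV32).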
+  EVT LoadStoreVT = VT;
+  do {
+    LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+  } while (!TLI.isTypeLegal(LoadStoreVT));
+
+  const Align LoadStoreAlign = [&]() -> Align {
+    if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
+      return Align(1);
+    return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+  }();
+
+  const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
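+  // The shift can be done in a single step when the shift amount is a
+  // multiple of the load/store unit: the aligned reload then performs the
+  // whole shift and no residual bit-shift is needed.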
+  const bool IsOneStepShift =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+      Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.
-  if (!ShiftByByteMultiple)
+  if (!IsOneStepShift)
     ShAmt = DAG.getFreeze(ShAmt);
 
   unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4551,8 +4562,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 
   // Get a temporary stack slot 2x the width of our VT.
   // FIXME: reuse stack slots?
-  // FIXME: should we be more picky about alignment?
-  Align StackSlotAlignment(1);
+  Align StackSlotAlignment(LoadStoreAlign);
   SDValue StackPtr = DAG.CreateStackTemporary(
       TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
   EVT PtrTy = StackPtr.getValueType();
@@ -4577,16 +4587,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
 
   // Now, compute the full-byte offset into stack slot from where we can load.
-  // We have shift amount, which is in bits, but in multiples of byte.
-  // So just divide by CHAR_BIT.
+  // We have the shift amount in bits. Round it down to a multiple of the
+  // shift unit so that the load happens from an aligned address.
   SDNodeFlags Flags;
-  if (ShiftByByteMultiple)
+  if (IsOneStepShift)
     Flags.setExact(true);
-  SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(3, dl, ShAmtVT), Flags);
+  SDValue OffsetInBits = DAG.getNode(
+      ISD::SHL, dl, ShAmtVT,
+      DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+                  DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT),
+                  Flags),
+      DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+  Flags.setExact(true);
+  SDValue Offset = DAG.getNode(ISD::SRL, dl, ShAmtVT, OffsetInBits,
+                               DAG.getConstant(3, dl, ShAmtVT), Flags);
   // And clamp it, because OOB load is an immediate UB,
   // while shift overflow would have *just* been poison.
-  ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
-                           DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
+  Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
+                       DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
   // We have exactly two strategies on indexing into stack slot here:
   // 1. upwards starting from the beginning of the slot
@@ -4603,23 +4619,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   } else {
     AdjStackPtr = DAG.getMemBasePlusOffset(
         StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
-    ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+    Offset = DAG.getNegative(Offset, dl, ShAmtVT);
   }
 
   // Get the pointer somewhere into the stack slot from which we need to load.
-  ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
-  AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+  Offset = DAG.getSExtOrTrunc(Offset, dl, PtrTy);
+  AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
 
   // And load it! While the load is not legal, legalizing it is obvious.
   SDValue Res = DAG.getLoad(
       VT, dl, Ch, AdjStackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
-  // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+      LoadStoreAlign);
+  // We've performed the shift by ShiftUnitInBits * (ShAmt / ShiftUnitInBits).
 
-  // If we may still have a less-than-CHAR_BIT to shift by, do so now.
-  if (!ShiftByByteMultiple) {
+  // If we may still have remaining bits to shift by, do so now.
+  if (!IsOneStepShift) {
     SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(7, dl, ShAmtVT));
+                                   DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
     Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
   }
 
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed725..5ba8755201ddf 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
-; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
-; RV32I-NEXT:    srli a3, a5, 24
-; RV32I-NEXT:    sb a3, 7(sp)
-; RV32I-NEXT:    srli a3, a5, 16
-; RV32I-NEXT:    sb a3, 6(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw a3, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lw a1, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a1, a2
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    andi a7, a2, 31
+; RV32I-NEXT:    lw t0, 8(a3)
+; RV32I-NEXT:    xori a7, a7, 31
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sll a6, t0, a2
+; RV32I-NEXT:    lw a3, 12(a3)
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli a6, t0, 1
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    sll a2, a4, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
-; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..0b87bb05cfd63 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb s2, 18(sp)
-; RV32I-NEXT:    sb s1, 17(sp)
-; RV32I-NEXT:    sb s0, 16(sp)
-; RV32I-NEXT:    sb t6, 15(sp)
-; RV32I-NEXT:    sb t5, 14(sp)
-; RV32I-NEXT:    sb t4, 13(sp)
-; RV32I-NEXT:    sb t3, 12(sp)
-; RV32I-NEXT:    sb t2, 11(sp)
-; RV32I-NEXT:    sb t1, 10(sp)
-; RV32I-NEXT:    sb t0, 9(sp)
-; RV32I-NEXT:    sb a7, 8(sp)
-; RV32I-NEXT:    sb a6, 7(sp)
-; RV32I-NEXT:    sb a5, 6(sp)
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    sb a3, 4(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 4
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -942,98 +961,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb s2, 34(sp)
-; RV32I-NEXT:    sb s1, 33(sp)
-; RV32I-NEXT:    sb s0, 32(sp)
-; RV32I-NEXT:    sb t6, 31(sp)
-; RV32I-NEXT:    sb t5, 30(sp)
-; RV32I-NEXT:    sb t4, 29(sp)
-; RV32I-NEXT:    sb t3, 28(sp)
-; RV32I-NEXT:    sb t2, 27(sp)
-; RV32I-NEXT:    sb t1, 26(sp)
-; RV32I-NEXT:    sb t0, 25(sp)
-; RV32I-NEXT:    sb a7, 24(sp)
-; RV32I-NEXT:    sb a6, 23(sp)
-; RV32I-NEXT:    sb a5, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a3, 20(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 20
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    srli t1, a3, 1
+; RV32I-NEXT:    srl t1, t1, a6
+; RV32I-NEXT:    or t1, t0, t1
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, t0, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 7(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    srli a0, t1, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, t1, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1161,105 +1199,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1272,441 +1323,429 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
-; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t0, 81(sp)
-; RV64I-NEXT:    sb ra, 80(sp)
-; RV64I-NEXT:    sb s11, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
 ; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
-; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb t0, 53(sp)
-; RV32I-NEXT:    sb ra, 52(sp)
-; RV32I-NEXT:    sb s11, 51(sp)
-; RV32I-NEXT:    sb s10, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
-; RV32I-NEXT:    ret
-  %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
   %res = lshr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
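To make the new fixed-length sequences above easier to follow, here is a minimal C++ model of the word-granular shift-through-stack lowering they implement, written against the RV32I output (function and variable names are illustrative, not the legalizer's): the bytes are assembled into naturally aligned words in a stack slot twice the value's width with the upper half zeroed, `byteOff & 28` picks the aligned word to start loading from, and the remaining sub-word amount is handled with a per-word funnel shift. The `(x << 1) << (sh ^ 31)` idiom mirrors the `slli`/`sll` pair in the assembly and sidesteps an out-of-range shift by 32 when sh is 0. Only the low word of the i256 shift amount is consulted, matching the code above; larger shifts are poison in the IR anyway.

#include <cstdint>
#include <cstring>

// Sketch of lshr_32bytes as emitted for RV32I (little-endian assumed).
void lshr_32bytes_model(const uint8_t *src, const uint8_t *byteOffPtr,
                        uint8_t *dst) {
  uint32_t slot[16] = {0};             // 2x-wide slot; upper half stays zero
  std::memcpy(slot, src, 32);          // bytes merged into aligned words
  uint32_t byteOff;
  std::memcpy(&byteOff, byteOffPtr, 4);
  const uint32_t *w = slot + (byteOff & 28) / 4;  // andi a0, a1, 28
  unsigned sh = (byteOff * 8) & 24;               // sub-word bit shift
  for (int i = 0; i < 8; ++i) {
    // (x << 1) << (sh ^ 31) == x << (32 - sh), well-defined at sh == 0.
    uint32_t hi = (w[i + 1] << 1) << (sh ^ 31);
    uint32_t res = (w[i] >> sh) | hi;
    std::memcpy(dst + 4 * i, &res, 4);
  }
}

Compared with the removed byte-granular version, the slot is only ever accessed with word-sized, naturally aligned loads and stores.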
@@ -1715,438 +1754,426 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
-; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t0, 113(sp)
-; RV64I-NEXT:    sb ra, 112(sp)
-; RV64I-NEXT:    sb s11, 111(sp)
-; RV64I-NEXT:    sb s10, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a6, a4, a1
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    srli a0, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a0, a0, a7
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    sll t0, t0, a1
+; RV64I-NEXT:    srli t1, a3, 1
+; RV64I-NEXT:    srl t1, t1, a7
+; RV64I-NEXT:    or t1, t0, t1
+; RV64I-NEXT:    sll a3, a3, a1
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a7
+; RV64I-NEXT:    or a4, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a3, t0, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    srli a1, a6, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb t1, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a4, 48
 ; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a1, t1, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, t1, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, t1, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, t1, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, t1, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, t1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb t0, 85(sp)
-; RV32I-NEXT:    sb ra, 84(sp)
-; RV32I-NEXT:    sb s11, 83(sp)
-; RV32I-NEXT:    sb s10, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a3, 36(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 36
+; RV32I-NEXT:    sub a6, a3, a0
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    slli a7, a1, 3
+; RV32I-NEXT:    lw t0, 0(a6)
+; RV32I-NEXT:    sll a4, a3, a7
+; RV32I-NEXT:    andi a0, a7, 24
+; RV32I-NEXT:    xori t1, a0, 31
+; RV32I-NEXT:    srli a0, t0, 1
+; RV32I-NEXT:    lw t2, 12(a6)
+; RV32I-NEXT:    lw a5, 8(a6)
+; RV32I-NEXT:    srl a0, a0, t1
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sll t3, t2, a7
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    srl a1, a1, t1
+; RV32I-NEXT:    or a1, t3, a1
+; RV32I-NEXT:    sll t4, a5, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t5, 20(a6)
+; RV32I-NEXT:    lw t6, 16(a6)
+; RV32I-NEXT:    srl a3, a3, t1
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    sll s0, t5, a7
+; RV32I-NEXT:    srli a5, t6, 1
+; RV32I-NEXT:    srl a5, a5, t1
+; RV32I-NEXT:    or a5, s0, a5
+; RV32I-NEXT:    sll t6, t6, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    lw s1, 28(a6)
+; RV32I-NEXT:    lw a6, 24(a6)
+; RV32I-NEXT:    srl t2, t2, t1
+; RV32I-NEXT:    or t2, t6, t2
+; RV32I-NEXT:    sll s1, s1, a7
+; RV32I-NEXT:    srli s2, a6, 1
+; RV32I-NEXT:    srl s2, s2, t1
+; RV32I-NEXT:    or s2, s1, s2
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    srli t5, t5, 1
+; RV32I-NEXT:    srl t1, t5, t1
+; RV32I-NEXT:    or t1, a6, t1
+; RV32I-NEXT:    sll a7, t0, a7
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    srli a6, a6, 24
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    srli s1, s1, 24
+; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    srli a6, t6, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli s0, s0, 24
+; RV32I-NEXT:    sb s0, 23(a2)
+; RV32I-NEXT:    srli a6, t4, 24
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    srli a6, t3, 24
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    sb t1, 24(a2)
+; RV32I-NEXT:    sb s2, 28(a2)
+; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, s2, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a4, s2, 8
 ; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
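The shl_32bytes output is the same scheme mirrored: the value now sits in the upper half of the slot with zeros below it, the aligned start pointer is formed by subtraction (`sub a6, a3, a0`) rather than addition, and the funnel pulls bits up from the next lower word. A matching sketch, under the same assumptions and illustrative naming as the lshr model above:

#include <cstdint>
#include <cstring>

// Sketch of shl_32bytes as emitted for RV32I (little-endian assumed).
void shl_32bytes_model(const uint8_t *src, const uint8_t *byteOffPtr,
                       uint8_t *dst) {
  uint32_t slot[16] = {0};             // zeros below, value in the top half
  std::memcpy(slot + 8, src, 32);
  uint32_t byteOff;
  std::memcpy(&byteOff, byteOffPtr, 4);
  const uint32_t *w = slot + 8 - (byteOff & 28) / 4;  // sub a6, a3, a0
  unsigned sh = (byteOff * 8) & 24;
  for (int i = 0; i < 8; ++i) {
    // (x >> 1) >> (sh ^ 31) == x >> (32 - sh), well-defined at sh == 0.
    uint32_t lo = (w[i - 1] >> 1) >> (sh ^ 31);
    uint32_t res = (w[i] << sh) | lo;
    std::memcpy(dst + 4 * i, &res, 4);
  }
}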
@@ -2158,454 +2185,428 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
-; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    lbu t4, 8(a0)
-; RV64I-NEXT:    lbu t5, 9(a0)
-; RV64I-NEXT:    lbu t6, 10(a0)
-; RV64I-NEXT:    lbu s0, 11(a0)
-; RV64I-NEXT:    lbu s1, 12(a0)
-; RV64I-NEXT:    lbu s2, 13(a0)
-; RV64I-NEXT:    lbu s3, 14(a0)
-; RV64I-NEXT:    lbu s4, 15(a0)
-; RV64I-NEXT:    lbu s5, 16(a0)
-; RV64I-NEXT:    lbu s6, 17(a0)
-; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 19(a0)
-; RV64I-NEXT:    lbu s9, 20(a0)
-; RV64I-NEXT:    lbu s10, 21(a0)
-; RV64I-NEXT:    lbu s11, 22(a0)
-; RV64I-NEXT:    lbu ra, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a4, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb ra, 79(sp)
-; RV64I-NEXT:    sb s11, 78(sp)
-; RV64I-NEXT:    sb s10, 77(sp)
-; RV64I-NEXT:    sb s9, 76(sp)
-; RV64I-NEXT:    sb s8, 75(sp)
-; RV64I-NEXT:    sb s7, 74(sp)
-; RV64I-NEXT:    sb s6, 73(sp)
-; RV64I-NEXT:    sb s5, 72(sp)
-; RV64I-NEXT:    sb s4, 71(sp)
-; RV64I-NEXT:    sb s3, 70(sp)
-; RV64I-NEXT:    sb s2, 69(sp)
-; RV64I-NEXT:    sb s1, 68(sp)
-; RV64I-NEXT:    sb s0, 67(sp)
-; RV64I-NEXT:    sb t6, 66(sp)
-; RV64I-NEXT:    sb t5, 65(sp)
-; RV64I-NEXT:    sb t4, 64(sp)
-; RV64I-NEXT:    sb t3, 63(sp)
-; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
 ; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
 ; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t3, 7(a0)
-; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s6, 17(a0)
-; RV32I-NEXT:    lbu s7, 18(a0)
-; RV32I-NEXT:    lbu s8, 19(a0)
-; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 21(a0)
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a4, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s9, 48(sp)
-; RV32I-NEXT:    sb s8, 47(sp)
-; RV32I-NEXT:    sb s7, 46(sp)
-; RV32I-NEXT:    sb s6, 45(sp)
-; RV32I-NEXT:    sb s5, 44(sp)
-; RV32I-NEXT:    sb s4, 43(sp)
-; RV32I-NEXT:    sb s3, 42(sp)
-; RV32I-NEXT:    sb s2, 41(sp)
-; RV32I-NEXT:    sb s1, 40(sp)
-; RV32I-NEXT:    sb s0, 39(sp)
-; RV32I-NEXT:    sb t6, 38(sp)
-; RV32I-NEXT:    sb t5, 37(sp)
-; RV32I-NEXT:    sb t4, 36(sp)
-; RV32I-NEXT:    sb t3, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
-; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a1, a1, t3
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    sra a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
 ; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
 ; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afa..7e879b137b4f0 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
-; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    sll a7, t0, a1
+; RV32I-NEXT:    srli t0, a3, 1
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
 ; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a1, 24
 ; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a5, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
-; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
-; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
 ; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    srl a1, a3, a1
 ; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
 ; RV64I-NEXT:    srli a1, a4, 48
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 3(a2)
 ; RV64I-NEXT:    srli a1, a4, 16
 ; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
-; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    srl a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 88
-; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a0, a4, a1
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    srli a7, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a7, a7, a6
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    sll a7, t0, a1
 ; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
+; RV64I-NEXT:    srl t0, t0, a6
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
 ; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a4, a1, 56
+; RV64I-NEXT:    sb a4, 7(a2)
+; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    sb a4, 6(a2)
+; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    sb a4, 5(a2)
+; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    sb a4, 3(a2)
+; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    sb a4, 2(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb a3, 16(a2)
+; RV64I-NEXT:    sb a7, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a1, 23(a2)
 ; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
 ; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    sb a1, 21(a2)
 ; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    sb a1, 20(a2)
 ; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 19(a2)
 ; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    sb a3, 17(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t2, 56(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    sw t0, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    addi a1, sp, 32
+; RV32I-NEXT:    sub a4, a1, a0
+; RV32I-NEXT:    lw a3, 4(a4)
+; RV32I-NEXT:    lw a5, 0(a4)
+; RV32I-NEXT:    sll a0, a3, a7
+; RV32I-NEXT:    andi a1, a7, 31
+; RV32I-NEXT:    xori a6, a1, 31
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 8(a4)
+; RV32I-NEXT:    srl a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a7
+; RV32I-NEXT:    srli t2, t1, 1
+; RV32I-NEXT:    srl t2, t2, a6
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 16(a4)
+; RV32I-NEXT:    srl a3, a3, a6
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    sll t1, t2, a7
+; RV32I-NEXT:    srli t4, t3, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    srli t0, t0, 1
+; RV32I-NEXT:    lw t4, 28(a4)
+; RV32I-NEXT:    lw a4, 24(a4)
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    sll t3, t4, a7
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    sll a4, a4, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    srl a6, t2, a6
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    sll a5, a5, a7
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    srli a6, a5, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a5, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb a4, 24(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb t1, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    sb a5, 27(a2)
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    sb a5, 26(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
+; RV32I-NEXT:    srli a4, t3, 24
 ; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
+; RV32I-NEXT:    srli a4, t3, 16
 ; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
-; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
-; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
-; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
-; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    or t2, t3, s2
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
-; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
-; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 9(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 14(a2)
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a4, t3, 8
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 24
 ; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t3, 6(a0)
-; RV64I-NEXT:    lbu t4, 7(a0)
-; RV64I-NEXT:    lbu t5, 8(a0)
-; RV64I-NEXT:    lbu t6, 9(a0)
-; RV64I-NEXT:    lbu s0, 10(a0)
-; RV64I-NEXT:    lbu s1, 11(a0)
-; RV64I-NEXT:    lbu s2, 12(a0)
-; RV64I-NEXT:    lbu s3, 13(a0)
-; RV64I-NEXT:    lbu s4, 14(a0)
-; RV64I-NEXT:    lbu s5, 15(a0)
-; RV64I-NEXT:    lbu s6, 16(a0)
-; RV64I-NEXT:    lbu s7, 17(a0)
-; RV64I-NEXT:    lbu s8, 18(a0)
-; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
-; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
-; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s9, 75(sp)
-; RV64I-NEXT:    sb s8, 74(sp)
-; RV64I-NEXT:    sb s7, 73(sp)
-; RV64I-NEXT:    sb s6, 72(sp)
-; RV64I-NEXT:    sb s5, 71(sp)
-; RV64I-NEXT:    sb s4, 70(sp)
-; RV64I-NEXT:    sb s3, 69(sp)
-; RV64I-NEXT:    sb s2, 68(sp)
-; RV64I-NEXT:    sb s1, 67(sp)
-; RV64I-NEXT:    sb s0, 66(sp)
-; RV64I-NEXT:    sb t6, 65(sp)
-; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb t4, 63(sp)
-; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or t0, t0, t1
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
-; RV64I-NEXT:    slli t1, a4, 1
 ; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    xori t0, a3, 63
-; RV64I-NEXT:    sll t1, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
-; RV64I-NEXT:    srl a4, a6, a3
-; RV64I-NEXT:    srl a5, a5, a3
-; RV64I-NEXT:    sra a3, a7, a3
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
 ; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a3, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a3, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a3, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a3, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 25(a2)
-; RV64I-NEXT:    srli a3, a4, 48
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    srli a3, a4, 40
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    srli a3, a4, 32
-; RV64I-NEXT:    sb a3, 4(a2)
-; RV64I-NEXT:    srli a3, a4, 24
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a3, a4, 16
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    or a3, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
-; RV64I-NEXT:    srli a3, a3, 56
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
-; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t3, t3, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t3, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    sra a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1

>From 90bdd43610ce819f50898df8b4adbc97aea9023b Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Fri, 21 Jun 2024 13:52:26 +0200
Subject: [PATCH 2/4] Address review comments

Use unaligned memory access only if the target supports fast unaligned
memory access and the shift amount is a multiple of CHAR_BIT.

Address formatting review comments.
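
As a minimal standalone sketch of the resulting alignment decision (the
function and parameter names below are illustrative stand-ins, not the
actual TLI/KnownBits API used in the hunk further down):

    #include <iostream>

    // Pick the load/store alignment for the shift-through-stack
    // expansion: byte alignment only when misaligned accesses are fast
    // on the target AND the shift amount is provably a multiple of
    // CHAR_BIT (i.e. at least 3 known-zero trailing bits); otherwise
    // use the native register alignment, so every load stays aligned.
    unsigned chooseLoadStoreAlign(bool FastMisaligned,
                                  unsigned KnownTrailingZeros,
                                  unsigned NativeAlign) {
      if (FastMisaligned && KnownTrailingZeros >= 3)
        return 1;
      return NativeAlign;
    }

    int main() {
      // e.g. a target without fast misaligned access, shift amount 8*x:
      unsigned A = chooseLoadStoreAlign(/*FastMisaligned=*/false,
                                        /*KnownTrailingZeros=*/3,
                                        /*NativeAlign=*/4);
      std::cout << "align=" << A << ", shift unit=" << A * 8 << " bits\n";
      // Prints: align=4, shift unit=32 bits
    }

This is the decision that lets the MIPS diffs below replace the
unaligned swl/swr and lwl/lwr pairs with plain aligned sw/lw.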
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |   55 +-
 .../AArch64/wide-scalar-shift-legalization.ll |  117 +-
 llvm/test/CodeGen/Mips/llvm-ir/ashr.ll        |  331 +-
 llvm/test/CodeGen/Mips/llvm-ir/lshr.ll        |  323 +-
 llvm/test/CodeGen/Mips/llvm-ir/shl.ll         |  302 +-
 llvm/test/CodeGen/PowerPC/ctrloop-sh.ll       |  244 +-
 llvm/test/CodeGen/PowerPC/pr59074.ll          |   80 +-
 .../PowerPC/wide-scalar-shift-legalization.ll |  644 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |  431 +-
 .../div-rem-pair-recomposition-unsigned.ll    |  346 +-
 llvm/test/CodeGen/X86/pr38539.ll              |  144 +-
 .../CodeGen/X86/scheduler-backtracking.ll     |   50 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |  546 +-
 llvm/test/CodeGen/X86/shift-i256.ll           |  358 +-
 .../X86/wide-scalar-shift-legalization.ll     | 7086 ++++++++---------
 15 files changed, 5019 insertions(+), 6038 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index f21ed7581a5af..cd40df473c67c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4532,19 +4532,29 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 
   EVT LoadStoreVT = VT;
   do {
-      LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
-  }while (!TLI.isTypeLegal(LoadStoreVT));
+    LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+  } while (!TLI.isTypeLegal(LoadStoreVT));
 
-  const Align LoadStoreAlign = [&]() -> Align {
-      if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
-          return Align(1);
+  const unsigned KnownTrailingZeros =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros();
 
-      return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+  const Align LoadStoreAlign = [&]() -> Align {
+    unsigned IsFast = 0;
+    const bool AllowsFastMisalignedMemoryAccesses =
+        TLI.allowsMisalignedMemoryAccesses(
+            LoadStoreVT, /*AddrSpace*/ 0, /*Alignment*/ Align(1),
+            /*Flags*/ MachineMemOperand::MONone, &IsFast) &&
+        IsFast;
+    if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
+      return Align(1);
+
+    return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
   }();
 
   const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
   const bool IsOneStepShift =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= Log2_32(ShiftUnitInBits);
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+      Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.
@@ -4590,20 +4600,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   // We have shift amount, which is in bits. Offset should point to an aligned
   // address.
   SDNodeFlags Flags;
+
   if (IsOneStepShift)
     Flags.setExact(true);
-  SDValue OffsetInBits = DAG.getNode(ISD::SHL, dl, ShAmtVT,
-                                     DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt, DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags),
-                                     DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+  SDValue SrlTmp = DAG.getNode(
+      ISD::SRL, dl, ShAmtVT, ShAmt,
+      DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags);
+  SDValue OffsetInBits =
+      DAG.getNode(ISD::SHL, dl, ShAmtVT, SrlTmp,
+                  DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+
   Flags.setExact(true);
-  SDValue Offset = DAG.getNode(
-      ISD::SRL, dl, ShAmtVT,
-      OffsetInBits,
-      DAG.getConstant(3, dl, ShAmtVT), Flags);
+  SDValue Offset = DAG.getNode(ISD::SRL, dl, ShAmtVT, OffsetInBits,
+                               DAG.getConstant(3, dl, ShAmtVT), Flags);
   // And clamp it, because OOB load is an immediate UB,
   // while shift overflow would have *just* been poison.
   Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
-                           DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
+                       DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
   // We have exactly two strategies on indexing into stack slot here:
   // 1. upwards starting from the beginning of the slot
   // 2. downwards starting from the middle of the slot
@@ -4627,15 +4640,17 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
 
   // And load it! While the load is not legal, legalizing it is obvious.
-  SDValue Res = DAG.getLoad(
-      VT, dl, Ch, AdjStackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), LoadStoreAlign);
+  SDValue Res =
+      DAG.getLoad(VT, dl, Ch, AdjStackPtr,
+                  MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+                  LoadStoreAlign);
   // We've performed the shift by a CHAR_BIT * [ShAmt / LoadAlign]
 
   // If we may still have a remaining bits to shift by, do so now.
   if (!IsOneStepShift) {
-    SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
+    SDValue ShAmtRem =
+        DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+                    DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
     Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
   }
 
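To make the index math in the hunk above concrete, here is a standalone
model with made-up example values (plain arithmetic only; the real code
builds the equivalent SRL/SHL/AND SDNodes instead):

    #include <cstdio>

    int main() {
      const unsigned ShiftUnitInBits = 32; // LoadStoreAlign * 8
      const unsigned Log2Unit = 5;         // Log2_32(ShiftUnitInBits)
      const unsigned VTByteWidth = 32;     // i256 -> 32 bytes
      unsigned ShAmt = 77;                 // example shift amount in bits

      // Round the shift amount down to a multiple of the shift unit,
      // so the subsequent load happens at an aligned address.
      unsigned OffsetInBits = (ShAmt >> Log2Unit) << Log2Unit;
      // Convert to bytes and clamp, because an out-of-bounds load is
      // immediate UB while shift overflow is merely poison.
      unsigned Offset = (OffsetInBits >> 3) & (VTByteWidth - 1);
      // The bits not covered by the byte offset are shifted in
      // registers as the second step.
      unsigned ShAmtRem = ShAmt & (ShiftUnitInBits - 1);

      std::printf("byte offset=%u, residual shift=%u\n", Offset, ShAmtRem);
      // For ShAmt=77: byte offset=8, residual shift=13.
    }
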
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index a4da6db57ecae..531e0fa740da7 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    ubfx x8, x10, #3, #5
+; ALL-NEXT:    lsr x8, x10, #3
 ; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x8, x8, #0x18
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    eor x12, x12, #0x3f
 ; ALL-NEXT:    add x8, x9, x8
-; ALL-NEXT:    mvn w13, w10
-; ALL-NEXT:    ldp x11, x9, [x8, #16]
-; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    ldp x13, x11, [x8]
+; ALL-NEXT:    ldr x9, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8, #16]
 ; ALL-NEXT:    lsl x14, x9, #1
+; ALL-NEXT:    lsr x9, x9, x10
 ; ALL-NEXT:    lsl x15, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsl x16, x12, #1
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsr x12, x12, x10
-; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x13, x13, x10
+; ALL-NEXT:    lsl x14, x14, x12
+; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    lsl x15, x8, #1
 ; ALL-NEXT:    lsr x8, x8, x10
-; ALL-NEXT:    lsl x10, x16, x13
-; ALL-NEXT:    lsl x13, x15, x13
-; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    stp x11, x9, [x2, #16]
-; ALL-NEXT:    orr x8, x10, x8
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsl x10, x15, x10
+; ALL-NEXT:    orr x8, x14, x8
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x9, x8, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #48]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ubfx x9, x10, #3, #5
-; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    lsr x8, x10, #3
+; ALL-NEXT:    mov x9, sp
+; ALL-NEXT:    add x9, x9, #32
 ; ALL-NEXT:    stp q0, q1, [sp, #16]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x8, x8, #0x18
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    mvn w13, w10
-; ALL-NEXT:    ldp x9, x11, [x8]
-; ALL-NEXT:    ldp x12, x8, [x8, #16]
-; ALL-NEXT:    lsr x14, x9, #1
-; ALL-NEXT:    lsr x15, x11, #1
-; ALL-NEXT:    lsl x11, x11, x10
-; ALL-NEXT:    lsr x16, x12, #1
+; ALL-NEXT:    eor x12, x12, #0x3f
+; ALL-NEXT:    sub x8, x9, x8
+; ALL-NEXT:    ldp x11, x13, [x8, #16]
+; ALL-NEXT:    ldr x9, [x8]
+; ALL-NEXT:    ldr x8, [x8, #8]
+; ALL-NEXT:    lsr x15, x9, #1
 ; ALL-NEXT:    lsl x9, x9, x10
-; ALL-NEXT:    lsl x12, x12, x10
-; ALL-NEXT:    lsr x14, x14, x13
+; ALL-NEXT:    lsr x14, x11, #1
+; ALL-NEXT:    lsl x11, x11, x10
+; ALL-NEXT:    lsl x13, x13, x10
+; ALL-NEXT:    lsr x14, x14, x12
+; ALL-NEXT:    lsr x12, x15, x12
+; ALL-NEXT:    lsr x15, x8, #1
 ; ALL-NEXT:    lsl x8, x8, x10
-; ALL-NEXT:    lsr x10, x16, x13
-; ALL-NEXT:    lsr x13, x15, x13
-; ALL-NEXT:    orr x11, x11, x14
-; ALL-NEXT:    stp x9, x11, [x2]
-; ALL-NEXT:    orr x8, x8, x10
-; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x9, x8, [x2, #16]
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsr x10, x15, x10
+; ALL-NEXT:    orr x8, x8, x12
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    orr x9, x13, x14
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    lsr x9, x10, #3
 ; ALL-NEXT:    asr x8, x8, #63
-; ALL-NEXT:    ubfx x9, x10, #3, #5
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x9, x9, #0x18
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
-; ALL-NEXT:    add x9, x11, x9
-; ALL-NEXT:    mvn w13, w10
+; ALL-NEXT:    eor x12, x12, #0x3f
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    ldp x11, x8, [x9, #16]
-; ALL-NEXT:    ldp x9, x12, [x9]
-; ALL-NEXT:    lsl x14, x8, #1
+; ALL-NEXT:    add x8, x11, x9
+; ALL-NEXT:    ldp x13, x11, [x8]
+; ALL-NEXT:    ldr x9, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8, #16]
+; ALL-NEXT:    lsl x14, x9, #1
+; ALL-NEXT:    asr x9, x9, x10
 ; ALL-NEXT:    lsl x15, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsl x16, x12, #1
-; ALL-NEXT:    asr x8, x8, x10
-; ALL-NEXT:    lsr x12, x12, x10
-; ALL-NEXT:    lsl x14, x14, x13
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsl x10, x16, x13
-; ALL-NEXT:    lsl x13, x15, x13
-; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    stp x11, x8, [x2, #16]
-; ALL-NEXT:    orr x8, x10, x9
+; ALL-NEXT:    lsr x13, x13, x10
+; ALL-NEXT:    lsl x14, x14, x12
+; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    lsl x15, x8, #1
+; ALL-NEXT:    lsr x8, x8, x10
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsl x10, x15, x10
+; ALL-NEXT:    orr x8, x14, x8
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x9, x8, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 450fe968d4917..6db3fb930b94e 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS:       # %bb.0: # %entry
 ; MIPS-NEXT:    addiu $sp, $sp, -32
 ; MIPS-NEXT:    .cfi_def_cfa_offset 32
-; MIPS-NEXT:    swl $7, 28($sp)
-; MIPS-NEXT:    swl $6, 24($sp)
 ; MIPS-NEXT:    sra $1, $4, 31
-; MIPS-NEXT:    swl $5, 20($sp)
-; MIPS-NEXT:    swl $4, 16($sp)
-; MIPS-NEXT:    swl $1, 12($sp)
-; MIPS-NEXT:    swl $1, 8($sp)
-; MIPS-NEXT:    swl $1, 4($sp)
-; MIPS-NEXT:    swl $1, 0($sp)
-; MIPS-NEXT:    addiu $2, $sp, 0
-; MIPS-NEXT:    swr $7, 31($sp)
-; MIPS-NEXT:    swr $6, 27($sp)
-; MIPS-NEXT:    swr $5, 23($sp)
-; MIPS-NEXT:    swr $4, 19($sp)
-; MIPS-NEXT:    swr $1, 15($sp)
-; MIPS-NEXT:    swr $1, 11($sp)
-; MIPS-NEXT:    swr $1, 7($sp)
-; MIPS-NEXT:    swr $1, 3($sp)
-; MIPS-NEXT:    addiu $1, $2, 16
+; MIPS-NEXT:    sw $7, 28($sp)
+; MIPS-NEXT:    sw $6, 24($sp)
+; MIPS-NEXT:    sw $5, 20($sp)
+; MIPS-NEXT:    sw $4, 16($sp)
+; MIPS-NEXT:    sw $1, 12($sp)
+; MIPS-NEXT:    sw $1, 8($sp)
+; MIPS-NEXT:    sw $1, 4($sp)
+; MIPS-NEXT:    sw $1, 0($sp)
+; MIPS-NEXT:    addiu $1, $sp, 0
+; MIPS-NEXT:    addiu $1, $1, 16
 ; MIPS-NEXT:    lw $2, 60($sp)
 ; MIPS-NEXT:    srl $3, $2, 3
-; MIPS-NEXT:    andi $3, $3, 15
+; MIPS-NEXT:    andi $3, $3, 12
 ; MIPS-NEXT:    subu $1, $1, $3
-; MIPS-NEXT:    lwl $3, 4($1)
-; MIPS-NEXT:    lwr $3, 7($1)
-; MIPS-NEXT:    sll $4, $3, 1
-; MIPS-NEXT:    lwl $5, 8($1)
-; MIPS-NEXT:    lwr $5, 11($1)
-; MIPS-NEXT:    andi $2, $2, 7
-; MIPS-NEXT:    not $6, $2
-; MIPS-NEXT:    srlv $7, $5, $2
-; MIPS-NEXT:    sllv $4, $4, $6
+; MIPS-NEXT:    lw $3, 4($1)
+; MIPS-NEXT:    lw $5, 8($1)
+; MIPS-NEXT:    srlv $4, $5, $2
+; MIPS-NEXT:    sll $6, $3, 1
+; MIPS-NEXT:    andi $7, $2, 31
+; MIPS-NEXT:    xori $7, $7, 31
+; MIPS-NEXT:    sllv $6, $6, $7
 ; MIPS-NEXT:    srlv $3, $3, $2
-; MIPS-NEXT:    lwl $6, 0($1)
-; MIPS-NEXT:    lwr $6, 3($1)
-; MIPS-NEXT:    sll $8, $6, 1
-; MIPS-NEXT:    xori $9, $2, 31
-; MIPS-NEXT:    sllv $8, $8, $9
-; MIPS-NEXT:    or $3, $3, $8
-; MIPS-NEXT:    or $4, $7, $4
-; MIPS-NEXT:    lwl $7, 12($1)
-; MIPS-NEXT:    lwr $7, 15($1)
-; MIPS-NEXT:    srlv $1, $7, $2
+; MIPS-NEXT:    lw $8, 0($1)
+; MIPS-NEXT:    sll $9, $8, 1
+; MIPS-NEXT:    sllv $9, $9, $7
+; MIPS-NEXT:    or $3, $3, $9
+; MIPS-NEXT:    or $4, $4, $6
+; MIPS-NEXT:    lw $1, 12($1)
+; MIPS-NEXT:    srlv $1, $1, $2
 ; MIPS-NEXT:    sll $5, $5, 1
-; MIPS-NEXT:    sllv $5, $5, $9
+; MIPS-NEXT:    sllv $5, $5, $7
 ; MIPS-NEXT:    or $5, $1, $5
-; MIPS-NEXT:    srav $2, $6, $2
+; MIPS-NEXT:    srav $2, $8, $2
 ; MIPS-NEXT:    jr $ra
 ; MIPS-NEXT:    addiu $sp, $sp, 32
 ;
@@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $7, 28($sp)
-; MIPS32-NEXT:    swl $6, 24($sp)
 ; MIPS32-NEXT:    sra $1, $4, 31
-; MIPS32-NEXT:    swl $5, 20($sp)
-; MIPS32-NEXT:    swl $4, 16($sp)
-; MIPS32-NEXT:    swl $1, 12($sp)
-; MIPS32-NEXT:    swl $1, 8($sp)
-; MIPS32-NEXT:    swl $1, 4($sp)
-; MIPS32-NEXT:    swl $1, 0($sp)
-; MIPS32-NEXT:    addiu $2, $sp, 0
-; MIPS32-NEXT:    swr $7, 31($sp)
-; MIPS32-NEXT:    swr $6, 27($sp)
-; MIPS32-NEXT:    swr $5, 23($sp)
-; MIPS32-NEXT:    swr $4, 19($sp)
-; MIPS32-NEXT:    swr $1, 15($sp)
-; MIPS32-NEXT:    swr $1, 11($sp)
-; MIPS32-NEXT:    swr $1, 7($sp)
-; MIPS32-NEXT:    swr $1, 3($sp)
-; MIPS32-NEXT:    addiu $1, $2, 16
+; MIPS32-NEXT:    sw $7, 28($sp)
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $5, 20($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
+; MIPS32-NEXT:    sw $1, 12($sp)
+; MIPS32-NEXT:    sw $1, 8($sp)
+; MIPS32-NEXT:    sw $1, 4($sp)
+; MIPS32-NEXT:    sw $1, 0($sp)
+; MIPS32-NEXT:    addiu $1, $sp, 0
+; MIPS32-NEXT:    addiu $1, $1, 16
 ; MIPS32-NEXT:    lw $2, 60($sp)
 ; MIPS32-NEXT:    srl $3, $2, 3
-; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    andi $3, $3, 12
 ; MIPS32-NEXT:    subu $1, $1, $3
-; MIPS32-NEXT:    lwl $3, 4($1)
-; MIPS32-NEXT:    lwr $3, 7($1)
-; MIPS32-NEXT:    sll $4, $3, 1
-; MIPS32-NEXT:    lwl $5, 8($1)
-; MIPS32-NEXT:    lwr $5, 11($1)
-; MIPS32-NEXT:    andi $2, $2, 7
-; MIPS32-NEXT:    not $6, $2
-; MIPS32-NEXT:    srlv $7, $5, $2
-; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    lw $3, 4($1)
+; MIPS32-NEXT:    lw $5, 8($1)
+; MIPS32-NEXT:    srlv $4, $5, $2
+; MIPS32-NEXT:    sll $6, $3, 1
+; MIPS32-NEXT:    andi $7, $2, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    sllv $6, $6, $7
 ; MIPS32-NEXT:    srlv $3, $3, $2
-; MIPS32-NEXT:    lwl $6, 0($1)
-; MIPS32-NEXT:    lwr $6, 3($1)
-; MIPS32-NEXT:    sll $8, $6, 1
-; MIPS32-NEXT:    xori $9, $2, 31
-; MIPS32-NEXT:    sllv $8, $8, $9
-; MIPS32-NEXT:    or $3, $3, $8
-; MIPS32-NEXT:    or $4, $7, $4
-; MIPS32-NEXT:    lwl $7, 12($1)
-; MIPS32-NEXT:    lwr $7, 15($1)
-; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    lw $8, 0($1)
+; MIPS32-NEXT:    sll $9, $8, 1
+; MIPS32-NEXT:    sllv $9, $9, $7
+; MIPS32-NEXT:    or $3, $3, $9
+; MIPS32-NEXT:    or $4, $4, $6
+; MIPS32-NEXT:    lw $1, 12($1)
+; MIPS32-NEXT:    srlv $1, $1, $2
 ; MIPS32-NEXT:    sll $5, $5, 1
-; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    sllv $5, $5, $7
 ; MIPS32-NEXT:    or $5, $1, $5
-; MIPS32-NEXT:    srav $2, $6, $2
+; MIPS32-NEXT:    srav $2, $8, $2
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
@@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R2:       # %bb.0: # %entry
 ; 32R2-NEXT:    addiu $sp, $sp, -32
 ; 32R2-NEXT:    .cfi_def_cfa_offset 32
-; 32R2-NEXT:    swl $7, 28($sp)
-; 32R2-NEXT:    swl $6, 24($sp)
-; 32R2-NEXT:    swl $5, 20($sp)
 ; 32R2-NEXT:    sra $1, $4, 31
-; 32R2-NEXT:    swl $4, 16($sp)
-; 32R2-NEXT:    swl $1, 12($sp)
-; 32R2-NEXT:    swl $1, 8($sp)
-; 32R2-NEXT:    swl $1, 4($sp)
-; 32R2-NEXT:    swl $1, 0($sp)
-; 32R2-NEXT:    swr $7, 31($sp)
-; 32R2-NEXT:    swr $6, 27($sp)
-; 32R2-NEXT:    swr $5, 23($sp)
-; 32R2-NEXT:    swr $4, 19($sp)
-; 32R2-NEXT:    swr $1, 15($sp)
-; 32R2-NEXT:    swr $1, 11($sp)
-; 32R2-NEXT:    swr $1, 7($sp)
-; 32R2-NEXT:    swr $1, 3($sp)
+; 32R2-NEXT:    sw $7, 28($sp)
+; 32R2-NEXT:    sw $6, 24($sp)
+; 32R2-NEXT:    sw $5, 20($sp)
+; 32R2-NEXT:    sw $4, 16($sp)
+; 32R2-NEXT:    sw $1, 12($sp)
+; 32R2-NEXT:    sw $1, 8($sp)
+; 32R2-NEXT:    sw $1, 4($sp)
+; 32R2-NEXT:    sw $1, 0($sp)
 ; 32R2-NEXT:    addiu $1, $sp, 0
 ; 32R2-NEXT:    addiu $1, $1, 16
 ; 32R2-NEXT:    lw $2, 60($sp)
-; 32R2-NEXT:    ext $3, $2, 3, 4
+; 32R2-NEXT:    srl $3, $2, 3
+; 32R2-NEXT:    andi $3, $3, 12
 ; 32R2-NEXT:    subu $1, $1, $3
-; 32R2-NEXT:    lwl $3, 4($1)
-; 32R2-NEXT:    lwr $3, 7($1)
-; 32R2-NEXT:    sll $4, $3, 1
-; 32R2-NEXT:    lwl $5, 8($1)
-; 32R2-NEXT:    lwr $5, 11($1)
-; 32R2-NEXT:    andi $2, $2, 7
-; 32R2-NEXT:    not $6, $2
-; 32R2-NEXT:    srlv $7, $5, $2
-; 32R2-NEXT:    sllv $4, $4, $6
+; 32R2-NEXT:    lw $3, 4($1)
+; 32R2-NEXT:    lw $5, 8($1)
+; 32R2-NEXT:    srlv $4, $5, $2
+; 32R2-NEXT:    sll $6, $3, 1
+; 32R2-NEXT:    andi $7, $2, 31
+; 32R2-NEXT:    xori $7, $7, 31
+; 32R2-NEXT:    sllv $6, $6, $7
 ; 32R2-NEXT:    srlv $3, $3, $2
-; 32R2-NEXT:    lwl $6, 0($1)
-; 32R2-NEXT:    lwr $6, 3($1)
-; 32R2-NEXT:    sll $8, $6, 1
-; 32R2-NEXT:    xori $9, $2, 31
-; 32R2-NEXT:    sllv $8, $8, $9
-; 32R2-NEXT:    or $3, $3, $8
-; 32R2-NEXT:    or $4, $7, $4
-; 32R2-NEXT:    lwl $7, 12($1)
-; 32R2-NEXT:    lwr $7, 15($1)
-; 32R2-NEXT:    srlv $1, $7, $2
+; 32R2-NEXT:    lw $8, 0($1)
+; 32R2-NEXT:    sll $9, $8, 1
+; 32R2-NEXT:    sllv $9, $9, $7
+; 32R2-NEXT:    or $3, $3, $9
+; 32R2-NEXT:    or $4, $4, $6
+; 32R2-NEXT:    lw $1, 12($1)
+; 32R2-NEXT:    srlv $1, $1, $2
 ; 32R2-NEXT:    sll $5, $5, 1
-; 32R2-NEXT:    sllv $5, $5, $9
+; 32R2-NEXT:    sllv $5, $5, $7
 ; 32R2-NEXT:    or $5, $1, $5
-; 32R2-NEXT:    srav $2, $6, $2
+; 32R2-NEXT:    srav $2, $8, $2
 ; 32R2-NEXT:    jr $ra
 ; 32R2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R6-NEXT:    addiu $1, $sp, 0
 ; 32R6-NEXT:    addiu $1, $1, 16
 ; 32R6-NEXT:    lw $2, 60($sp)
-; 32R6-NEXT:    ext $3, $2, 3, 4
+; 32R6-NEXT:    srl $3, $2, 3
+; 32R6-NEXT:    andi $3, $3, 12
 ; 32R6-NEXT:    subu $1, $1, $3
 ; 32R6-NEXT:    lw $3, 4($1)
-; 32R6-NEXT:    sll $4, $3, 1
 ; 32R6-NEXT:    lw $5, 8($1)
-; 32R6-NEXT:    andi $2, $2, 7
-; 32R6-NEXT:    not $6, $2
-; 32R6-NEXT:    srlv $7, $5, $2
-; 32R6-NEXT:    sllv $4, $4, $6
+; 32R6-NEXT:    srlv $4, $5, $2
+; 32R6-NEXT:    sll $6, $3, 1
+; 32R6-NEXT:    andi $7, $2, 31
+; 32R6-NEXT:    xori $7, $7, 31
+; 32R6-NEXT:    sllv $6, $6, $7
 ; 32R6-NEXT:    srlv $3, $3, $2
-; 32R6-NEXT:    lw $6, 0($1)
-; 32R6-NEXT:    sll $8, $6, 1
-; 32R6-NEXT:    xori $9, $2, 31
-; 32R6-NEXT:    sllv $8, $8, $9
-; 32R6-NEXT:    or $3, $3, $8
-; 32R6-NEXT:    or $4, $7, $4
+; 32R6-NEXT:    lw $8, 0($1)
+; 32R6-NEXT:    sll $9, $8, 1
+; 32R6-NEXT:    sllv $9, $9, $7
+; 32R6-NEXT:    or $3, $3, $9
+; 32R6-NEXT:    or $4, $4, $6
 ; 32R6-NEXT:    lw $1, 12($1)
 ; 32R6-NEXT:    srlv $1, $1, $2
 ; 32R6-NEXT:    sll $5, $5, 1
-; 32R6-NEXT:    sllv $5, $5, $9
+; 32R6-NEXT:    sllv $5, $5, $7
 ; 32R6-NEXT:    or $5, $1, $5
-; 32R6-NEXT:    srav $2, $6, $2
+; 32R6-NEXT:    srav $2, $8, $2
 ; 32R6-NEXT:    jr $ra
 ; 32R6-NEXT:    addiu $sp, $sp, 32
 ;
@@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    swl $7, 28($sp)
-; MMR3-NEXT:    swl $6, 24($sp)
-; MMR3-NEXT:    swl $5, 20($sp)
 ; MMR3-NEXT:    sra $1, $4, 31
-; MMR3-NEXT:    swl $4, 16($sp)
-; MMR3-NEXT:    swl $1, 12($sp)
-; MMR3-NEXT:    swl $1, 8($sp)
-; MMR3-NEXT:    swl $1, 4($sp)
-; MMR3-NEXT:    swl $1, 0($sp)
-; MMR3-NEXT:    swr $7, 31($sp)
-; MMR3-NEXT:    swr $6, 27($sp)
-; MMR3-NEXT:    swr $5, 23($sp)
-; MMR3-NEXT:    swr $4, 19($sp)
-; MMR3-NEXT:    swr $1, 15($sp)
-; MMR3-NEXT:    swr $1, 11($sp)
-; MMR3-NEXT:    swr $1, 7($sp)
-; MMR3-NEXT:    swr $1, 3($sp)
+; MMR3-NEXT:    swp $6, 24($sp)
+; MMR3-NEXT:    swp $4, 16($sp)
+; MMR3-NEXT:    sw $1, 12($sp)
+; MMR3-NEXT:    sw $1, 8($sp)
+; MMR3-NEXT:    sw $1, 4($sp)
+; MMR3-NEXT:    sw $1, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
-; MMR3-NEXT:    ext $4, $3, 3, 4
-; MMR3-NEXT:    subu16 $2, $2, $4
-; MMR3-NEXT:    lwl $7, 4($2)
-; MMR3-NEXT:    lwr $7, 7($2)
-; MMR3-NEXT:    sll16 $4, $7, 1
-; MMR3-NEXT:    lwl $5, 8($2)
-; MMR3-NEXT:    lwr $5, 11($2)
-; MMR3-NEXT:    andi16 $6, $3, 7
-; MMR3-NEXT:    not16 $3, $6
-; MMR3-NEXT:    andi16 $3, $3, 31
-; MMR3-NEXT:    srlv $16, $5, $6
-; MMR3-NEXT:    sllv $4, $4, $3
-; MMR3-NEXT:    srlv $17, $7, $6
-; MMR3-NEXT:    lwl $7, 0($2)
-; MMR3-NEXT:    lwr $7, 3($2)
-; MMR3-NEXT:    sll16 $3, $7, 1
-; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    srl16 $4, $3, 3
+; MMR3-NEXT:    andi $4, $4, 12
+; MMR3-NEXT:    subu16 $5, $2, $4
+; MMR3-NEXT:    lwp $6, 4($5)
+; MMR3-NEXT:    andi16 $2, $3, 31
+; MMR3-NEXT:    srlv $16, $7, $2
+; MMR3-NEXT:    sll16 $3, $6, 1
+; MMR3-NEXT:    xori $1, $2, 31
+; MMR3-NEXT:    sllv $4, $3, $1
+; MMR3-NEXT:    srlv $6, $6, $2
+; MMR3-NEXT:    lw16 $17, 0($5)
+; MMR3-NEXT:    sll16 $3, $17, 1
 ; MMR3-NEXT:    sllv $3, $3, $1
-; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $3, $6
 ; MMR3-NEXT:    or16 $4, $16
-; MMR3-NEXT:    lwl $8, 12($2)
-; MMR3-NEXT:    lwr $8, 15($2)
-; MMR3-NEXT:    srlv $2, $8, $6
-; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    lw16 $5, 12($5)
+; MMR3-NEXT:    srlv $6, $5, $2
+; MMR3-NEXT:    sll16 $5, $7, 1
 ; MMR3-NEXT:    sllv $5, $5, $1
-; MMR3-NEXT:    or16 $5, $2
-; MMR3-NEXT:    srav $2, $7, $6
+; MMR3-NEXT:    or16 $5, $6
+; MMR3-NEXT:    srav $2, $17, $2
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
@@ -725,29 +671,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    addiu $2, $sp, 4
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
-; MMR6-NEXT:    ext $4, $3, 3, 4
-; MMR6-NEXT:    subu16 $5, $2, $4
-; MMR6-NEXT:    lw16 $4, 4($5)
-; MMR6-NEXT:    sll16 $6, $4, 1
-; MMR6-NEXT:    lw16 $7, 8($5)
-; MMR6-NEXT:    andi16 $2, $3, 7
-; MMR6-NEXT:    not16 $3, $2
-; MMR6-NEXT:    andi16 $3, $3, 31
-; MMR6-NEXT:    srlv $1, $7, $2
-; MMR6-NEXT:    sllv $6, $6, $3
-; MMR6-NEXT:    srlv $3, $4, $2
-; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    srl16 $4, $3, 3
+; MMR6-NEXT:    andi $4, $4, 12
+; MMR6-NEXT:    subu16 $2, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($2)
+; MMR6-NEXT:    lw16 $5, 8($2)
+; MMR6-NEXT:    andi16 $6, $3, 31
+; MMR6-NEXT:    srlv $1, $5, $6
+; MMR6-NEXT:    sll16 $3, $4, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    sllv $8, $3, $7
+; MMR6-NEXT:    srlv $3, $4, $6
+; MMR6-NEXT:    lw16 $16, 0($2)
 ; MMR6-NEXT:    sll16 $4, $16, 1
-; MMR6-NEXT:    xori $8, $2, 31
-; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    sllv $4, $4, $7
 ; MMR6-NEXT:    or $3, $3, $4
-; MMR6-NEXT:    or $4, $1, $6
-; MMR6-NEXT:    lw16 $5, 12($5)
-; MMR6-NEXT:    srlv $1, $5, $2
-; MMR6-NEXT:    sll16 $5, $7, 1
-; MMR6-NEXT:    sllv $5, $5, $8
-; MMR6-NEXT:    or $5, $1, $5
-; MMR6-NEXT:    srav $2, $16, $2
+; MMR6-NEXT:    or $4, $1, $8
+; MMR6-NEXT:    lw16 $2, 12($2)
+; MMR6-NEXT:    srlv $1, $2, $6
+; MMR6-NEXT:    sll16 $2, $5, 1
+; MMR6-NEXT:    sllv $2, $2, $7
+; MMR6-NEXT:    or $5, $1, $2
+; MMR6-NEXT:    srav $2, $16, $6
 ; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
 ; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 03cf104e3120c..fa10293c0f6fb 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2:       # %bb.0: # %entry
 ; MIPS2-NEXT:    addiu $sp, $sp, -32
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS2-NEXT:    swl $7, 28($sp)
-; MIPS2-NEXT:    swl $6, 24($sp)
-; MIPS2-NEXT:    swl $5, 20($sp)
-; MIPS2-NEXT:    swl $4, 16($sp)
-; MIPS2-NEXT:    swl $zero, 12($sp)
-; MIPS2-NEXT:    swl $zero, 8($sp)
-; MIPS2-NEXT:    swl $zero, 4($sp)
-; MIPS2-NEXT:    swl $zero, 0($sp)
 ; MIPS2-NEXT:    addiu $1, $sp, 0
-; MIPS2-NEXT:    swr $7, 31($sp)
-; MIPS2-NEXT:    swr $6, 27($sp)
-; MIPS2-NEXT:    swr $5, 23($sp)
-; MIPS2-NEXT:    swr $4, 19($sp)
-; MIPS2-NEXT:    swr $zero, 15($sp)
-; MIPS2-NEXT:    swr $zero, 11($sp)
-; MIPS2-NEXT:    swr $zero, 7($sp)
-; MIPS2-NEXT:    swr $zero, 3($sp)
+; MIPS2-NEXT:    sw $7, 28($sp)
+; MIPS2-NEXT:    sw $6, 24($sp)
+; MIPS2-NEXT:    sw $5, 20($sp)
+; MIPS2-NEXT:    sw $4, 16($sp)
 ; MIPS2-NEXT:    addiu $1, $1, 16
 ; MIPS2-NEXT:    lw $2, 60($sp)
 ; MIPS2-NEXT:    srl $3, $2, 3
-; MIPS2-NEXT:    andi $3, $3, 15
+; MIPS2-NEXT:    andi $3, $3, 12
 ; MIPS2-NEXT:    subu $1, $1, $3
-; MIPS2-NEXT:    lwl $3, 4($1)
-; MIPS2-NEXT:    lwr $3, 7($1)
-; MIPS2-NEXT:    sll $4, $3, 1
-; MIPS2-NEXT:    lwl $5, 8($1)
-; MIPS2-NEXT:    lwr $5, 11($1)
-; MIPS2-NEXT:    andi $2, $2, 7
-; MIPS2-NEXT:    not $6, $2
-; MIPS2-NEXT:    srlv $7, $5, $2
-; MIPS2-NEXT:    sllv $4, $4, $6
+; MIPS2-NEXT:    sw $zero, 12($sp)
+; MIPS2-NEXT:    sw $zero, 8($sp)
+; MIPS2-NEXT:    sw $zero, 4($sp)
+; MIPS2-NEXT:    sw $zero, 0($sp)
+; MIPS2-NEXT:    lw $3, 4($1)
+; MIPS2-NEXT:    lw $5, 8($1)
+; MIPS2-NEXT:    srlv $4, $5, $2
+; MIPS2-NEXT:    sll $6, $3, 1
+; MIPS2-NEXT:    andi $7, $2, 31
+; MIPS2-NEXT:    xori $7, $7, 31
+; MIPS2-NEXT:    sllv $6, $6, $7
 ; MIPS2-NEXT:    srlv $3, $3, $2
-; MIPS2-NEXT:    lwl $6, 0($1)
-; MIPS2-NEXT:    lwr $6, 3($1)
-; MIPS2-NEXT:    sll $8, $6, 1
-; MIPS2-NEXT:    xori $9, $2, 31
-; MIPS2-NEXT:    sllv $8, $8, $9
-; MIPS2-NEXT:    or $3, $3, $8
-; MIPS2-NEXT:    or $4, $7, $4
-; MIPS2-NEXT:    lwl $7, 12($1)
-; MIPS2-NEXT:    lwr $7, 15($1)
-; MIPS2-NEXT:    srlv $1, $7, $2
+; MIPS2-NEXT:    lw $8, 0($1)
+; MIPS2-NEXT:    sll $9, $8, 1
+; MIPS2-NEXT:    sllv $9, $9, $7
+; MIPS2-NEXT:    or $3, $3, $9
+; MIPS2-NEXT:    or $4, $4, $6
+; MIPS2-NEXT:    lw $1, 12($1)
+; MIPS2-NEXT:    srlv $1, $1, $2
 ; MIPS2-NEXT:    sll $5, $5, 1
-; MIPS2-NEXT:    sllv $5, $5, $9
+; MIPS2-NEXT:    sllv $5, $5, $7
 ; MIPS2-NEXT:    or $5, $1, $5
-; MIPS2-NEXT:    srlv $2, $6, $2
+; MIPS2-NEXT:    srlv $2, $8, $2
 ; MIPS2-NEXT:    jr $ra
 ; MIPS2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $7, 28($sp)
-; MIPS32-NEXT:    swl $6, 24($sp)
-; MIPS32-NEXT:    swl $5, 20($sp)
-; MIPS32-NEXT:    swl $4, 16($sp)
-; MIPS32-NEXT:    swl $zero, 12($sp)
-; MIPS32-NEXT:    swl $zero, 8($sp)
-; MIPS32-NEXT:    swl $zero, 4($sp)
-; MIPS32-NEXT:    swl $zero, 0($sp)
 ; MIPS32-NEXT:    addiu $1, $sp, 0
-; MIPS32-NEXT:    swr $7, 31($sp)
-; MIPS32-NEXT:    swr $6, 27($sp)
-; MIPS32-NEXT:    swr $5, 23($sp)
-; MIPS32-NEXT:    swr $4, 19($sp)
-; MIPS32-NEXT:    swr $zero, 15($sp)
-; MIPS32-NEXT:    swr $zero, 11($sp)
-; MIPS32-NEXT:    swr $zero, 7($sp)
-; MIPS32-NEXT:    swr $zero, 3($sp)
+; MIPS32-NEXT:    sw $7, 28($sp)
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $5, 20($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
 ; MIPS32-NEXT:    addiu $1, $1, 16
 ; MIPS32-NEXT:    lw $2, 60($sp)
 ; MIPS32-NEXT:    srl $3, $2, 3
-; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    andi $3, $3, 12
 ; MIPS32-NEXT:    subu $1, $1, $3
-; MIPS32-NEXT:    lwl $3, 4($1)
-; MIPS32-NEXT:    lwr $3, 7($1)
-; MIPS32-NEXT:    sll $4, $3, 1
-; MIPS32-NEXT:    lwl $5, 8($1)
-; MIPS32-NEXT:    lwr $5, 11($1)
-; MIPS32-NEXT:    andi $2, $2, 7
-; MIPS32-NEXT:    not $6, $2
-; MIPS32-NEXT:    srlv $7, $5, $2
-; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    sw $zero, 12($sp)
+; MIPS32-NEXT:    sw $zero, 8($sp)
+; MIPS32-NEXT:    sw $zero, 4($sp)
+; MIPS32-NEXT:    sw $zero, 0($sp)
+; MIPS32-NEXT:    lw $3, 4($1)
+; MIPS32-NEXT:    lw $5, 8($1)
+; MIPS32-NEXT:    srlv $4, $5, $2
+; MIPS32-NEXT:    sll $6, $3, 1
+; MIPS32-NEXT:    andi $7, $2, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    sllv $6, $6, $7
 ; MIPS32-NEXT:    srlv $3, $3, $2
-; MIPS32-NEXT:    lwl $6, 0($1)
-; MIPS32-NEXT:    lwr $6, 3($1)
-; MIPS32-NEXT:    sll $8, $6, 1
-; MIPS32-NEXT:    xori $9, $2, 31
-; MIPS32-NEXT:    sllv $8, $8, $9
-; MIPS32-NEXT:    or $3, $3, $8
-; MIPS32-NEXT:    or $4, $7, $4
-; MIPS32-NEXT:    lwl $7, 12($1)
-; MIPS32-NEXT:    lwr $7, 15($1)
-; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    lw $8, 0($1)
+; MIPS32-NEXT:    sll $9, $8, 1
+; MIPS32-NEXT:    sllv $9, $9, $7
+; MIPS32-NEXT:    or $3, $3, $9
+; MIPS32-NEXT:    or $4, $4, $6
+; MIPS32-NEXT:    lw $1, 12($1)
+; MIPS32-NEXT:    srlv $1, $1, $2
 ; MIPS32-NEXT:    sll $5, $5, 1
-; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    sllv $5, $5, $7
 ; MIPS32-NEXT:    or $5, $1, $5
-; MIPS32-NEXT:    srlv $2, $6, $2
+; MIPS32-NEXT:    srlv $2, $8, $2
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
@@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT:    swl $7, 28($sp)
-; MIPS32R2-NEXT:    swl $6, 24($sp)
-; MIPS32R2-NEXT:    swl $5, 20($sp)
-; MIPS32R2-NEXT:    swl $4, 16($sp)
-; MIPS32R2-NEXT:    swl $zero, 12($sp)
-; MIPS32R2-NEXT:    swl $zero, 8($sp)
-; MIPS32R2-NEXT:    swl $zero, 4($sp)
-; MIPS32R2-NEXT:    swl $zero, 0($sp)
-; MIPS32R2-NEXT:    swr $7, 31($sp)
-; MIPS32R2-NEXT:    swr $6, 27($sp)
-; MIPS32R2-NEXT:    swr $5, 23($sp)
-; MIPS32R2-NEXT:    swr $4, 19($sp)
-; MIPS32R2-NEXT:    swr $zero, 15($sp)
-; MIPS32R2-NEXT:    swr $zero, 11($sp)
-; MIPS32R2-NEXT:    swr $zero, 7($sp)
-; MIPS32R2-NEXT:    swr $zero, 3($sp)
 ; MIPS32R2-NEXT:    addiu $1, $sp, 0
+; MIPS32R2-NEXT:    sw $7, 28($sp)
+; MIPS32R2-NEXT:    sw $6, 24($sp)
+; MIPS32R2-NEXT:    sw $5, 20($sp)
+; MIPS32R2-NEXT:    sw $4, 16($sp)
 ; MIPS32R2-NEXT:    addiu $1, $1, 16
 ; MIPS32R2-NEXT:    lw $2, 60($sp)
-; MIPS32R2-NEXT:    ext $3, $2, 3, 4
+; MIPS32R2-NEXT:    srl $3, $2, 3
+; MIPS32R2-NEXT:    andi $3, $3, 12
 ; MIPS32R2-NEXT:    subu $1, $1, $3
-; MIPS32R2-NEXT:    lwl $3, 4($1)
-; MIPS32R2-NEXT:    lwr $3, 7($1)
-; MIPS32R2-NEXT:    sll $4, $3, 1
-; MIPS32R2-NEXT:    lwl $5, 8($1)
-; MIPS32R2-NEXT:    lwr $5, 11($1)
-; MIPS32R2-NEXT:    andi $2, $2, 7
-; MIPS32R2-NEXT:    not $6, $2
-; MIPS32R2-NEXT:    srlv $7, $5, $2
-; MIPS32R2-NEXT:    sllv $4, $4, $6
+; MIPS32R2-NEXT:    sw $zero, 12($sp)
+; MIPS32R2-NEXT:    sw $zero, 8($sp)
+; MIPS32R2-NEXT:    sw $zero, 4($sp)
+; MIPS32R2-NEXT:    sw $zero, 0($sp)
+; MIPS32R2-NEXT:    lw $3, 4($1)
+; MIPS32R2-NEXT:    lw $5, 8($1)
+; MIPS32R2-NEXT:    srlv $4, $5, $2
+; MIPS32R2-NEXT:    sll $6, $3, 1
+; MIPS32R2-NEXT:    andi $7, $2, 31
+; MIPS32R2-NEXT:    xori $7, $7, 31
+; MIPS32R2-NEXT:    sllv $6, $6, $7
 ; MIPS32R2-NEXT:    srlv $3, $3, $2
-; MIPS32R2-NEXT:    lwl $6, 0($1)
-; MIPS32R2-NEXT:    lwr $6, 3($1)
-; MIPS32R2-NEXT:    sll $8, $6, 1
-; MIPS32R2-NEXT:    xori $9, $2, 31
-; MIPS32R2-NEXT:    sllv $8, $8, $9
-; MIPS32R2-NEXT:    or $3, $3, $8
-; MIPS32R2-NEXT:    or $4, $7, $4
-; MIPS32R2-NEXT:    lwl $7, 12($1)
-; MIPS32R2-NEXT:    lwr $7, 15($1)
-; MIPS32R2-NEXT:    srlv $1, $7, $2
+; MIPS32R2-NEXT:    lw $8, 0($1)
+; MIPS32R2-NEXT:    sll $9, $8, 1
+; MIPS32R2-NEXT:    sllv $9, $9, $7
+; MIPS32R2-NEXT:    or $3, $3, $9
+; MIPS32R2-NEXT:    or $4, $4, $6
+; MIPS32R2-NEXT:    lw $1, 12($1)
+; MIPS32R2-NEXT:    srlv $1, $1, $2
 ; MIPS32R2-NEXT:    sll $5, $5, 1
-; MIPS32R2-NEXT:    sllv $5, $5, $9
+; MIPS32R2-NEXT:    sllv $5, $5, $7
 ; MIPS32R2-NEXT:    or $5, $1, $5
-; MIPS32R2-NEXT:    srlv $2, $6, $2
+; MIPS32R2-NEXT:    srlv $2, $8, $2
 ; MIPS32R2-NEXT:    jr $ra
 ; MIPS32R2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    sw $4, 16($sp)
 ; MIPS32R6-NEXT:    addiu $1, $1, 16
 ; MIPS32R6-NEXT:    lw $2, 60($sp)
-; MIPS32R6-NEXT:    ext $3, $2, 3, 4
+; MIPS32R6-NEXT:    srl $3, $2, 3
+; MIPS32R6-NEXT:    andi $3, $3, 12
 ; MIPS32R6-NEXT:    subu $1, $1, $3
 ; MIPS32R6-NEXT:    sw $zero, 12($sp)
 ; MIPS32R6-NEXT:    sw $zero, 8($sp)
 ; MIPS32R6-NEXT:    sw $zero, 4($sp)
 ; MIPS32R6-NEXT:    sw $zero, 0($sp)
 ; MIPS32R6-NEXT:    lw $3, 4($1)
-; MIPS32R6-NEXT:    sll $4, $3, 1
 ; MIPS32R6-NEXT:    lw $5, 8($1)
-; MIPS32R6-NEXT:    andi $2, $2, 7
-; MIPS32R6-NEXT:    not $6, $2
-; MIPS32R6-NEXT:    srlv $7, $5, $2
-; MIPS32R6-NEXT:    sllv $4, $4, $6
+; MIPS32R6-NEXT:    srlv $4, $5, $2
+; MIPS32R6-NEXT:    sll $6, $3, 1
+; MIPS32R6-NEXT:    andi $7, $2, 31
+; MIPS32R6-NEXT:    xori $7, $7, 31
+; MIPS32R6-NEXT:    sllv $6, $6, $7
 ; MIPS32R6-NEXT:    srlv $3, $3, $2
-; MIPS32R6-NEXT:    lw $6, 0($1)
-; MIPS32R6-NEXT:    sll $8, $6, 1
-; MIPS32R6-NEXT:    xori $9, $2, 31
-; MIPS32R6-NEXT:    sllv $8, $8, $9
-; MIPS32R6-NEXT:    or $3, $3, $8
-; MIPS32R6-NEXT:    or $4, $7, $4
+; MIPS32R6-NEXT:    lw $8, 0($1)
+; MIPS32R6-NEXT:    sll $9, $8, 1
+; MIPS32R6-NEXT:    sllv $9, $9, $7
+; MIPS32R6-NEXT:    or $3, $3, $9
+; MIPS32R6-NEXT:    or $4, $4, $6
 ; MIPS32R6-NEXT:    lw $1, 12($1)
 ; MIPS32R6-NEXT:    srlv $1, $1, $2
 ; MIPS32R6-NEXT:    sll $5, $5, 1
-; MIPS32R6-NEXT:    sllv $5, $5, $9
+; MIPS32R6-NEXT:    sllv $5, $5, $7
 ; MIPS32R6-NEXT:    or $5, $1, $5
-; MIPS32R6-NEXT:    srlv $2, $6, $2
+; MIPS32R6-NEXT:    srlv $2, $8, $2
 ; MIPS32R6-NEXT:    jr $ra
 ; MIPS32R6-NEXT:    addiu $sp, $sp, 32
 ;
@@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    swl $7, 28($sp)
-; MMR3-NEXT:    swl $6, 24($sp)
-; MMR3-NEXT:    swl $5, 20($sp)
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    swl $4, 16($sp)
-; MMR3-NEXT:    swl $2, 12($sp)
-; MMR3-NEXT:    swl $2, 8($sp)
-; MMR3-NEXT:    swl $2, 4($sp)
-; MMR3-NEXT:    swl $2, 0($sp)
-; MMR3-NEXT:    swr $7, 31($sp)
-; MMR3-NEXT:    swr $6, 27($sp)
-; MMR3-NEXT:    swr $5, 23($sp)
-; MMR3-NEXT:    swr $4, 19($sp)
-; MMR3-NEXT:    swr $2, 15($sp)
-; MMR3-NEXT:    swr $2, 11($sp)
-; MMR3-NEXT:    swr $2, 7($sp)
-; MMR3-NEXT:    swr $2, 3($sp)
+; MMR3-NEXT:    swp $6, 24($sp)
+; MMR3-NEXT:    swp $4, 16($sp)
+; MMR3-NEXT:    sw $2, 12($sp)
+; MMR3-NEXT:    sw $2, 8($sp)
+; MMR3-NEXT:    sw $2, 4($sp)
+; MMR3-NEXT:    sw $2, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
-; MMR3-NEXT:    ext $4, $3, 3, 4
-; MMR3-NEXT:    subu16 $2, $2, $4
-; MMR3-NEXT:    lwl $7, 4($2)
-; MMR3-NEXT:    lwr $7, 7($2)
-; MMR3-NEXT:    sll16 $4, $7, 1
-; MMR3-NEXT:    lwl $5, 8($2)
-; MMR3-NEXT:    lwr $5, 11($2)
-; MMR3-NEXT:    andi16 $6, $3, 7
-; MMR3-NEXT:    not16 $3, $6
-; MMR3-NEXT:    andi16 $3, $3, 31
-; MMR3-NEXT:    srlv $16, $5, $6
-; MMR3-NEXT:    sllv $4, $4, $3
-; MMR3-NEXT:    srlv $17, $7, $6
-; MMR3-NEXT:    lwl $7, 0($2)
-; MMR3-NEXT:    lwr $7, 3($2)
-; MMR3-NEXT:    sll16 $3, $7, 1
-; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    srl16 $4, $3, 3
+; MMR3-NEXT:    andi $4, $4, 12
+; MMR3-NEXT:    subu16 $5, $2, $4
+; MMR3-NEXT:    lwp $6, 4($5)
+; MMR3-NEXT:    andi16 $2, $3, 31
+; MMR3-NEXT:    srlv $16, $7, $2
+; MMR3-NEXT:    sll16 $3, $6, 1
+; MMR3-NEXT:    xori $1, $2, 31
+; MMR3-NEXT:    sllv $4, $3, $1
+; MMR3-NEXT:    srlv $6, $6, $2
+; MMR3-NEXT:    lw16 $17, 0($5)
+; MMR3-NEXT:    sll16 $3, $17, 1
 ; MMR3-NEXT:    sllv $3, $3, $1
-; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $3, $6
 ; MMR3-NEXT:    or16 $4, $16
-; MMR3-NEXT:    lwl $8, 12($2)
-; MMR3-NEXT:    lwr $8, 15($2)
-; MMR3-NEXT:    srlv $2, $8, $6
-; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    lw16 $5, 12($5)
+; MMR3-NEXT:    srlv $6, $5, $2
+; MMR3-NEXT:    sll16 $5, $7, 1
 ; MMR3-NEXT:    sllv $5, $5, $1
-; MMR3-NEXT:    or16 $5, $2
-; MMR3-NEXT:    srlv $2, $7, $6
+; MMR3-NEXT:    or16 $5, $6
+; MMR3-NEXT:    srlv $2, $17, $2
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
@@ -746,29 +692,28 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    addiu $2, $sp, 4
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
-; MMR6-NEXT:    ext $4, $3, 3, 4
-; MMR6-NEXT:    subu16 $5, $2, $4
-; MMR6-NEXT:    lw16 $4, 4($5)
-; MMR6-NEXT:    sll16 $6, $4, 1
-; MMR6-NEXT:    lw16 $7, 8($5)
-; MMR6-NEXT:    andi16 $2, $3, 7
-; MMR6-NEXT:    not16 $3, $2
-; MMR6-NEXT:    andi16 $3, $3, 31
-; MMR6-NEXT:    srlv $1, $7, $2
-; MMR6-NEXT:    sllv $6, $6, $3
-; MMR6-NEXT:    srlv $3, $4, $2
-; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    srl16 $4, $3, 3
+; MMR6-NEXT:    andi $4, $4, 12
+; MMR6-NEXT:    subu16 $2, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($2)
+; MMR6-NEXT:    lw16 $5, 8($2)
+; MMR6-NEXT:    andi16 $6, $3, 31
+; MMR6-NEXT:    srlv $1, $5, $6
+; MMR6-NEXT:    sll16 $3, $4, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    sllv $8, $3, $7
+; MMR6-NEXT:    srlv $3, $4, $6
+; MMR6-NEXT:    lw16 $16, 0($2)
 ; MMR6-NEXT:    sll16 $4, $16, 1
-; MMR6-NEXT:    xori $8, $2, 31
-; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    sllv $4, $4, $7
 ; MMR6-NEXT:    or $3, $3, $4
-; MMR6-NEXT:    or $4, $1, $6
-; MMR6-NEXT:    lw16 $5, 12($5)
-; MMR6-NEXT:    srlv $1, $5, $2
-; MMR6-NEXT:    sll16 $5, $7, 1
-; MMR6-NEXT:    sllv $5, $5, $8
-; MMR6-NEXT:    or $5, $1, $5
-; MMR6-NEXT:    srlv $2, $16, $2
+; MMR6-NEXT:    or $4, $1, $8
+; MMR6-NEXT:    lw16 $2, 12($2)
+; MMR6-NEXT:    srlv $1, $2, $6
+; MMR6-NEXT:    sll16 $2, $5, 1
+; MMR6-NEXT:    sllv $2, $2, $7
+; MMR6-NEXT:    or $5, $1, $2
+; MMR6-NEXT:    srlv $2, $16, $6
 ; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
 ; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 81f089a529470..394890a9dcc7c 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2:       # %bb.0: # %entry
 ; MIPS2-NEXT:    addiu $sp, $sp, -32
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS2-NEXT:    swl $zero, 28($sp)
-; MIPS2-NEXT:    swl $zero, 24($sp)
-; MIPS2-NEXT:    swl $zero, 20($sp)
-; MIPS2-NEXT:    swl $zero, 16($sp)
-; MIPS2-NEXT:    swl $7, 12($sp)
-; MIPS2-NEXT:    swl $6, 8($sp)
-; MIPS2-NEXT:    swl $5, 4($sp)
-; MIPS2-NEXT:    swl $4, 0($sp)
-; MIPS2-NEXT:    swr $zero, 31($sp)
-; MIPS2-NEXT:    swr $zero, 27($sp)
-; MIPS2-NEXT:    swr $zero, 23($sp)
-; MIPS2-NEXT:    swr $zero, 19($sp)
-; MIPS2-NEXT:    swr $7, 15($sp)
-; MIPS2-NEXT:    swr $6, 11($sp)
-; MIPS2-NEXT:    swr $5, 7($sp)
-; MIPS2-NEXT:    swr $4, 3($sp)
 ; MIPS2-NEXT:    lw $1, 60($sp)
 ; MIPS2-NEXT:    srl $2, $1, 3
-; MIPS2-NEXT:    andi $2, $2, 15
+; MIPS2-NEXT:    sw $7, 12($sp)
+; MIPS2-NEXT:    sw $6, 8($sp)
+; MIPS2-NEXT:    sw $5, 4($sp)
+; MIPS2-NEXT:    sw $4, 0($sp)
+; MIPS2-NEXT:    andi $2, $2, 12
 ; MIPS2-NEXT:    addiu $3, $sp, 0
 ; MIPS2-NEXT:    addu $4, $3, $2
-; MIPS2-NEXT:    lwl $5, 8($4)
-; MIPS2-NEXT:    lwr $5, 11($4)
-; MIPS2-NEXT:    srl $2, $5, 1
-; MIPS2-NEXT:    lwl $3, 4($4)
-; MIPS2-NEXT:    lwr $3, 7($4)
-; MIPS2-NEXT:    andi $1, $1, 7
-; MIPS2-NEXT:    not $6, $1
-; MIPS2-NEXT:    sllv $7, $3, $1
-; MIPS2-NEXT:    srlv $6, $2, $6
-; MIPS2-NEXT:    lwl $2, 0($4)
-; MIPS2-NEXT:    lwr $2, 3($4)
-; MIPS2-NEXT:    sllv $2, $2, $1
-; MIPS2-NEXT:    srl $3, $3, 1
-; MIPS2-NEXT:    xori $8, $1, 31
-; MIPS2-NEXT:    srlv $3, $3, $8
-; MIPS2-NEXT:    or $2, $2, $3
-; MIPS2-NEXT:    or $3, $7, $6
+; MIPS2-NEXT:    sw $zero, 28($sp)
+; MIPS2-NEXT:    sw $zero, 24($sp)
+; MIPS2-NEXT:    sw $zero, 20($sp)
+; MIPS2-NEXT:    sw $zero, 16($sp)
+; MIPS2-NEXT:    lw $5, 8($4)
+; MIPS2-NEXT:    lw $2, 4($4)
+; MIPS2-NEXT:    sllv $3, $2, $1
+; MIPS2-NEXT:    srl $6, $5, 1
+; MIPS2-NEXT:    andi $7, $1, 31
+; MIPS2-NEXT:    xori $7, $7, 31
+; MIPS2-NEXT:    srlv $6, $6, $7
+; MIPS2-NEXT:    lw $8, 0($4)
+; MIPS2-NEXT:    sllv $8, $8, $1
+; MIPS2-NEXT:    srl $2, $2, 1
+; MIPS2-NEXT:    srlv $2, $2, $7
+; MIPS2-NEXT:    or $2, $8, $2
+; MIPS2-NEXT:    or $3, $3, $6
 ; MIPS2-NEXT:    sllv $5, $5, $1
-; MIPS2-NEXT:    lwl $6, 12($4)
-; MIPS2-NEXT:    lwr $6, 15($4)
+; MIPS2-NEXT:    lw $6, 12($4)
 ; MIPS2-NEXT:    srl $4, $6, 1
-; MIPS2-NEXT:    srlv $4, $4, $8
+; MIPS2-NEXT:    srlv $4, $4, $7
 ; MIPS2-NEXT:    or $4, $5, $4
 ; MIPS2-NEXT:    sllv $5, $6, $1
 ; MIPS2-NEXT:    jr $ra
@@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $zero, 28($sp)
-; MIPS32-NEXT:    swl $zero, 24($sp)
-; MIPS32-NEXT:    swl $zero, 20($sp)
-; MIPS32-NEXT:    swl $zero, 16($sp)
-; MIPS32-NEXT:    swl $7, 12($sp)
-; MIPS32-NEXT:    swl $6, 8($sp)
-; MIPS32-NEXT:    swl $5, 4($sp)
-; MIPS32-NEXT:    swl $4, 0($sp)
-; MIPS32-NEXT:    swr $zero, 31($sp)
-; MIPS32-NEXT:    swr $zero, 27($sp)
-; MIPS32-NEXT:    swr $zero, 23($sp)
-; MIPS32-NEXT:    swr $zero, 19($sp)
-; MIPS32-NEXT:    swr $7, 15($sp)
-; MIPS32-NEXT:    swr $6, 11($sp)
-; MIPS32-NEXT:    swr $5, 7($sp)
-; MIPS32-NEXT:    swr $4, 3($sp)
 ; MIPS32-NEXT:    lw $1, 60($sp)
 ; MIPS32-NEXT:    srl $2, $1, 3
-; MIPS32-NEXT:    andi $2, $2, 15
+; MIPS32-NEXT:    sw $7, 12($sp)
+; MIPS32-NEXT:    sw $6, 8($sp)
+; MIPS32-NEXT:    sw $5, 4($sp)
+; MIPS32-NEXT:    sw $4, 0($sp)
+; MIPS32-NEXT:    andi $2, $2, 12
 ; MIPS32-NEXT:    addiu $3, $sp, 0
 ; MIPS32-NEXT:    addu $4, $3, $2
-; MIPS32-NEXT:    lwl $5, 8($4)
-; MIPS32-NEXT:    lwr $5, 11($4)
-; MIPS32-NEXT:    srl $2, $5, 1
-; MIPS32-NEXT:    lwl $3, 4($4)
-; MIPS32-NEXT:    lwr $3, 7($4)
-; MIPS32-NEXT:    andi $1, $1, 7
-; MIPS32-NEXT:    not $6, $1
-; MIPS32-NEXT:    sllv $7, $3, $1
-; MIPS32-NEXT:    srlv $6, $2, $6
-; MIPS32-NEXT:    lwl $2, 0($4)
-; MIPS32-NEXT:    lwr $2, 3($4)
-; MIPS32-NEXT:    sllv $2, $2, $1
-; MIPS32-NEXT:    srl $3, $3, 1
-; MIPS32-NEXT:    xori $8, $1, 31
-; MIPS32-NEXT:    srlv $3, $3, $8
-; MIPS32-NEXT:    or $2, $2, $3
-; MIPS32-NEXT:    or $3, $7, $6
+; MIPS32-NEXT:    sw $zero, 28($sp)
+; MIPS32-NEXT:    sw $zero, 24($sp)
+; MIPS32-NEXT:    sw $zero, 20($sp)
+; MIPS32-NEXT:    sw $zero, 16($sp)
+; MIPS32-NEXT:    lw $5, 8($4)
+; MIPS32-NEXT:    lw $2, 4($4)
+; MIPS32-NEXT:    sllv $3, $2, $1
+; MIPS32-NEXT:    srl $6, $5, 1
+; MIPS32-NEXT:    andi $7, $1, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    srlv $6, $6, $7
+; MIPS32-NEXT:    lw $8, 0($4)
+; MIPS32-NEXT:    sllv $8, $8, $1
+; MIPS32-NEXT:    srl $2, $2, 1
+; MIPS32-NEXT:    srlv $2, $2, $7
+; MIPS32-NEXT:    or $2, $8, $2
+; MIPS32-NEXT:    or $3, $3, $6
 ; MIPS32-NEXT:    sllv $5, $5, $1
-; MIPS32-NEXT:    lwl $6, 12($4)
-; MIPS32-NEXT:    lwr $6, 15($4)
+; MIPS32-NEXT:    lw $6, 12($4)
 ; MIPS32-NEXT:    srl $4, $6, 1
-; MIPS32-NEXT:    srlv $4, $4, $8
+; MIPS32-NEXT:    srlv $4, $4, $7
 ; MIPS32-NEXT:    or $4, $5, $4
 ; MIPS32-NEXT:    sllv $5, $6, $1
 ; MIPS32-NEXT:    jr $ra
@@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT:    swl $zero, 28($sp)
-; MIPS32R2-NEXT:    swl $zero, 24($sp)
-; MIPS32R2-NEXT:    swl $zero, 20($sp)
-; MIPS32R2-NEXT:    swl $zero, 16($sp)
-; MIPS32R2-NEXT:    swl $7, 12($sp)
-; MIPS32R2-NEXT:    swl $6, 8($sp)
-; MIPS32R2-NEXT:    swl $5, 4($sp)
-; MIPS32R2-NEXT:    swl $4, 0($sp)
-; MIPS32R2-NEXT:    swr $zero, 31($sp)
-; MIPS32R2-NEXT:    swr $zero, 27($sp)
-; MIPS32R2-NEXT:    swr $zero, 23($sp)
-; MIPS32R2-NEXT:    swr $zero, 19($sp)
-; MIPS32R2-NEXT:    swr $7, 15($sp)
-; MIPS32R2-NEXT:    swr $6, 11($sp)
-; MIPS32R2-NEXT:    swr $5, 7($sp)
-; MIPS32R2-NEXT:    swr $4, 3($sp)
 ; MIPS32R2-NEXT:    lw $1, 60($sp)
-; MIPS32R2-NEXT:    ext $2, $1, 3, 4
+; MIPS32R2-NEXT:    srl $2, $1, 3
+; MIPS32R2-NEXT:    sw $7, 12($sp)
+; MIPS32R2-NEXT:    sw $6, 8($sp)
+; MIPS32R2-NEXT:    sw $5, 4($sp)
+; MIPS32R2-NEXT:    sw $4, 0($sp)
+; MIPS32R2-NEXT:    andi $2, $2, 12
 ; MIPS32R2-NEXT:    addiu $3, $sp, 0
 ; MIPS32R2-NEXT:    addu $4, $3, $2
-; MIPS32R2-NEXT:    lwl $5, 8($4)
-; MIPS32R2-NEXT:    lwr $5, 11($4)
-; MIPS32R2-NEXT:    srl $2, $5, 1
-; MIPS32R2-NEXT:    lwl $3, 4($4)
-; MIPS32R2-NEXT:    lwr $3, 7($4)
-; MIPS32R2-NEXT:    andi $1, $1, 7
-; MIPS32R2-NEXT:    not $6, $1
-; MIPS32R2-NEXT:    sllv $7, $3, $1
-; MIPS32R2-NEXT:    srlv $6, $2, $6
-; MIPS32R2-NEXT:    lwl $2, 0($4)
-; MIPS32R2-NEXT:    lwr $2, 3($4)
-; MIPS32R2-NEXT:    sllv $2, $2, $1
-; MIPS32R2-NEXT:    srl $3, $3, 1
-; MIPS32R2-NEXT:    xori $8, $1, 31
-; MIPS32R2-NEXT:    srlv $3, $3, $8
-; MIPS32R2-NEXT:    or $2, $2, $3
-; MIPS32R2-NEXT:    or $3, $7, $6
+; MIPS32R2-NEXT:    sw $zero, 28($sp)
+; MIPS32R2-NEXT:    sw $zero, 24($sp)
+; MIPS32R2-NEXT:    sw $zero, 20($sp)
+; MIPS32R2-NEXT:    sw $zero, 16($sp)
+; MIPS32R2-NEXT:    lw $5, 8($4)
+; MIPS32R2-NEXT:    lw $2, 4($4)
+; MIPS32R2-NEXT:    sllv $3, $2, $1
+; MIPS32R2-NEXT:    srl $6, $5, 1
+; MIPS32R2-NEXT:    andi $7, $1, 31
+; MIPS32R2-NEXT:    xori $7, $7, 31
+; MIPS32R2-NEXT:    srlv $6, $6, $7
+; MIPS32R2-NEXT:    lw $8, 0($4)
+; MIPS32R2-NEXT:    sllv $8, $8, $1
+; MIPS32R2-NEXT:    srl $2, $2, 1
+; MIPS32R2-NEXT:    srlv $2, $2, $7
+; MIPS32R2-NEXT:    or $2, $8, $2
+; MIPS32R2-NEXT:    or $3, $3, $6
 ; MIPS32R2-NEXT:    sllv $5, $5, $1
-; MIPS32R2-NEXT:    lwl $6, 12($4)
-; MIPS32R2-NEXT:    lwr $6, 15($4)
+; MIPS32R2-NEXT:    lw $6, 12($4)
 ; MIPS32R2-NEXT:    srl $4, $6, 1
-; MIPS32R2-NEXT:    srlv $4, $4, $8
+; MIPS32R2-NEXT:    srlv $4, $4, $7
 ; MIPS32R2-NEXT:    or $4, $5, $4
 ; MIPS32R2-NEXT:    sllv $5, $6, $1
 ; MIPS32R2-NEXT:    jr $ra
@@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R6-NEXT:    lw $1, 60($sp)
+; MIPS32R6-NEXT:    srl $2, $1, 3
 ; MIPS32R6-NEXT:    sw $7, 12($sp)
 ; MIPS32R6-NEXT:    sw $6, 8($sp)
 ; MIPS32R6-NEXT:    sw $5, 4($sp)
 ; MIPS32R6-NEXT:    sw $4, 0($sp)
-; MIPS32R6-NEXT:    ext $2, $1, 3, 4
+; MIPS32R6-NEXT:    andi $2, $2, 12
 ; MIPS32R6-NEXT:    addiu $3, $sp, 0
 ; MIPS32R6-NEXT:    addu $4, $3, $2
 ; MIPS32R6-NEXT:    sw $zero, 28($sp)
@@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    sw $zero, 20($sp)
 ; MIPS32R6-NEXT:    sw $zero, 16($sp)
 ; MIPS32R6-NEXT:    lw $5, 8($4)
-; MIPS32R6-NEXT:    srl $2, $5, 1
-; MIPS32R6-NEXT:    lw $3, 4($4)
-; MIPS32R6-NEXT:    andi $1, $1, 7
-; MIPS32R6-NEXT:    not $6, $1
-; MIPS32R6-NEXT:    sllv $7, $3, $1
-; MIPS32R6-NEXT:    srlv $6, $2, $6
-; MIPS32R6-NEXT:    lw $2, 0($4)
-; MIPS32R6-NEXT:    sllv $2, $2, $1
-; MIPS32R6-NEXT:    srl $3, $3, 1
-; MIPS32R6-NEXT:    xori $8, $1, 31
-; MIPS32R6-NEXT:    srlv $3, $3, $8
-; MIPS32R6-NEXT:    or $2, $2, $3
-; MIPS32R6-NEXT:    or $3, $7, $6
+; MIPS32R6-NEXT:    lw $2, 4($4)
+; MIPS32R6-NEXT:    sllv $3, $2, $1
+; MIPS32R6-NEXT:    srl $6, $5, 1
+; MIPS32R6-NEXT:    andi $7, $1, 31
+; MIPS32R6-NEXT:    xori $7, $7, 31
+; MIPS32R6-NEXT:    srlv $6, $6, $7
+; MIPS32R6-NEXT:    lw $8, 0($4)
+; MIPS32R6-NEXT:    sllv $8, $8, $1
+; MIPS32R6-NEXT:    srl $2, $2, 1
+; MIPS32R6-NEXT:    srlv $2, $2, $7
+; MIPS32R6-NEXT:    or $2, $8, $2
+; MIPS32R6-NEXT:    or $3, $3, $6
 ; MIPS32R6-NEXT:    sllv $5, $5, $1
 ; MIPS32R6-NEXT:    lw $6, 12($4)
 ; MIPS32R6-NEXT:    srl $4, $6, 1
-; MIPS32R6-NEXT:    srlv $4, $4, $8
+; MIPS32R6-NEXT:    srlv $4, $4, $7
 ; MIPS32R6-NEXT:    or $4, $5, $4
 ; MIPS32R6-NEXT:    sllv $5, $6, $1
 ; MIPS32R6-NEXT:    jr $ra
@@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    swl $2, 28($sp)
-; MMR3-NEXT:    swl $2, 24($sp)
-; MMR3-NEXT:    swl $2, 20($sp)
-; MMR3-NEXT:    swl $2, 16($sp)
-; MMR3-NEXT:    swl $7, 12($sp)
-; MMR3-NEXT:    swl $6, 8($sp)
-; MMR3-NEXT:    swl $5, 4($sp)
-; MMR3-NEXT:    swl $4, 0($sp)
-; MMR3-NEXT:    swr $2, 31($sp)
-; MMR3-NEXT:    swr $2, 27($sp)
-; MMR3-NEXT:    swr $2, 23($sp)
-; MMR3-NEXT:    swr $2, 19($sp)
-; MMR3-NEXT:    swr $7, 15($sp)
-; MMR3-NEXT:    swr $6, 11($sp)
-; MMR3-NEXT:    swr $5, 7($sp)
-; MMR3-NEXT:    swr $4, 3($sp)
+; MMR3-NEXT:    sw $2, 28($sp)
+; MMR3-NEXT:    sw $2, 24($sp)
+; MMR3-NEXT:    sw $2, 20($sp)
+; MMR3-NEXT:    sw $2, 16($sp)
+; MMR3-NEXT:    swp $6, 8($sp)
+; MMR3-NEXT:    swp $4, 0($sp)
 ; MMR3-NEXT:    lw $2, 68($sp)
-; MMR3-NEXT:    ext $3, $2, 3, 4
+; MMR3-NEXT:    srl16 $3, $2, 3
+; MMR3-NEXT:    andi $3, $3, 12
 ; MMR3-NEXT:    addiur1sp $4, 0
 ; MMR3-NEXT:    addu16 $4, $4, $3
-; MMR3-NEXT:    lwl $6, 8($4)
-; MMR3-NEXT:    lwr $6, 11($4)
-; MMR3-NEXT:    srl16 $3, $6, 1
-; MMR3-NEXT:    lwl $7, 4($4)
-; MMR3-NEXT:    lwr $7, 7($4)
-; MMR3-NEXT:    andi16 $5, $2, 7
-; MMR3-NEXT:    not16 $2, $5
-; MMR3-NEXT:    andi16 $2, $2, 31
+; MMR3-NEXT:    lw16 $6, 8($4)
+; MMR3-NEXT:    lw16 $7, 4($4)
+; MMR3-NEXT:    andi16 $5, $2, 31
 ; MMR3-NEXT:    sllv $16, $7, $5
-; MMR3-NEXT:    srlv $3, $3, $2
-; MMR3-NEXT:    lwl $1, 0($4)
-; MMR3-NEXT:    lwr $1, 3($4)
-; MMR3-NEXT:    sllv $17, $1, $5
-; MMR3-NEXT:    srl16 $2, $7, 1
+; MMR3-NEXT:    srl16 $2, $6, 1
 ; MMR3-NEXT:    xori $1, $5, 31
+; MMR3-NEXT:    srlv $3, $2, $1
+; MMR3-NEXT:    lw16 $2, 0($4)
+; MMR3-NEXT:    sllv $17, $2, $5
+; MMR3-NEXT:    srl16 $2, $7, 1
 ; MMR3-NEXT:    srlv $2, $2, $1
 ; MMR3-NEXT:    or16 $2, $17
 ; MMR3-NEXT:    or16 $3, $16
 ; MMR3-NEXT:    sllv $6, $6, $5
-; MMR3-NEXT:    lwl $7, 12($4)
-; MMR3-NEXT:    lwr $7, 15($4)
+; MMR3-NEXT:    lw16 $7, 12($4)
 ; MMR3-NEXT:    srl16 $4, $7, 1
 ; MMR3-NEXT:    srlv $4, $4, $1
 ; MMR3-NEXT:    or16 $4, $6
@@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    sw $5, 4($sp)
 ; MMR6-NEXT:    sw $4, 0($sp)
 ; MMR6-NEXT:    lw $2, 60($sp)
-; MMR6-NEXT:    ext $3, $2, 3, 4
+; MMR6-NEXT:    srl16 $3, $2, 3
+; MMR6-NEXT:    andi $3, $3, 12
 ; MMR6-NEXT:    addiu $4, $sp, 0
 ; MMR6-NEXT:    addu16 $4, $4, $3
-; MMR6-NEXT:    lw16 $6, 8($4)
-; MMR6-NEXT:    srl16 $3, $6, 1
-; MMR6-NEXT:    lw16 $7, 4($4)
-; MMR6-NEXT:    andi16 $5, $2, 7
-; MMR6-NEXT:    not16 $2, $5
-; MMR6-NEXT:    andi16 $2, $2, 31
-; MMR6-NEXT:    sllv $1, $7, $5
-; MMR6-NEXT:    srlv $3, $3, $2
+; MMR6-NEXT:    lw16 $5, 8($4)
+; MMR6-NEXT:    lw16 $3, 4($4)
+; MMR6-NEXT:    andi16 $6, $2, 31
+; MMR6-NEXT:    sllv $1, $3, $6
+; MMR6-NEXT:    srl16 $2, $5, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    srlv $8, $2, $7
 ; MMR6-NEXT:    lw16 $2, 0($4)
-; MMR6-NEXT:    sllv $2, $2, $5
-; MMR6-NEXT:    srl16 $7, $7, 1
-; MMR6-NEXT:    xori $8, $5, 31
-; MMR6-NEXT:    srlv $7, $7, $8
-; MMR6-NEXT:    or $2, $2, $7
-; MMR6-NEXT:    or $3, $1, $3
-; MMR6-NEXT:    sllv $1, $6, $5
-; MMR6-NEXT:    lw16 $6, 12($4)
-; MMR6-NEXT:    srl16 $4, $6, 1
-; MMR6-NEXT:    srlv $4, $4, $8
+; MMR6-NEXT:    sllv $2, $2, $6
+; MMR6-NEXT:    srl16 $3, $3, 1
+; MMR6-NEXT:    srlv $3, $3, $7
+; MMR6-NEXT:    or $2, $2, $3
+; MMR6-NEXT:    or $3, $1, $8
+; MMR6-NEXT:    sllv $1, $5, $6
+; MMR6-NEXT:    lw16 $5, 12($4)
+; MMR6-NEXT:    srl16 $4, $5, 1
+; MMR6-NEXT:    srlv $4, $4, $7
 ; MMR6-NEXT:    or $4, $1, $4
-; MMR6-NEXT:    sllv $5, $6, $5
+; MMR6-NEXT:    sllv $5, $5, $6
 ; MMR6-NEXT:    addiu $sp, $sp, 32
 ; MMR6-NEXT:    jrc $ra
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index c48361e0a8035..ae25feeb8893c 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 8, 2048
 ; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 2048
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 7
-; CHECK-NEXT:    mtctr 8
-; CHECK-NEXT:    addi 8, 1, 16
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    mtctr 7
+; CHECK-NEXT:    addi 7, 1, 20
 ; CHECK-NEXT:  .LBB0_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 9, 0(4)
-; CHECK-NEXT:    lwz 10, 4(4)
-; CHECK-NEXT:    lwz 11, 8(4)
-; CHECK-NEXT:    lwz 12, 12(4)
-; CHECK-NEXT:    lwz 0, 12(5)
+; CHECK-NEXT:    lwz 8, 0(4)
+; CHECK-NEXT:    lwz 9, 4(4)
+; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    lwz 12, 12(5)
+; CHECK-NEXT:    stw 6, 48(1)
 ; CHECK-NEXT:    stw 6, 44(1)
 ; CHECK-NEXT:    stw 6, 40(1)
 ; CHECK-NEXT:    stw 6, 36(1)
-; CHECK-NEXT:    stw 6, 32(1)
-; CHECK-NEXT:    stw 12, 28(1)
-; CHECK-NEXT:    clrlwi 12, 0, 29
-; CHECK-NEXT:    stw 11, 24(1)
-; CHECK-NEXT:    nand 11, 0, 7
-; CHECK-NEXT:    stw 10, 20(1)
-; CHECK-NEXT:    subfic 29, 12, 32
-; CHECK-NEXT:    stw 9, 16(1)
-; CHECK-NEXT:    rlwinm 9, 0, 29, 28, 31
-; CHECK-NEXT:    lwzux 10, 9, 8
-; CHECK-NEXT:    clrlwi 11, 11, 27
-; CHECK-NEXT:    lwz 0, 8(9)
-; CHECK-NEXT:    slw 10, 10, 12
-; CHECK-NEXT:    lwz 30, 4(9)
-; CHECK-NEXT:    lwz 9, 12(9)
-; CHECK-NEXT:    slw 28, 30, 12
-; CHECK-NEXT:    srw 30, 30, 29
-; CHECK-NEXT:    srw 29, 9, 29
-; CHECK-NEXT:    slw 9, 9, 12
-; CHECK-NEXT:    slw 12, 0, 12
-; CHECK-NEXT:    srwi 0, 0, 1
-; CHECK-NEXT:    stw 9, 12(3)
-; CHECK-NEXT:    or 9, 12, 29
-; CHECK-NEXT:    srw 11, 0, 11
-; CHECK-NEXT:    stw 9, 8(3)
-; CHECK-NEXT:    or 9, 10, 30
-; CHECK-NEXT:    stw 9, 0(3)
-; CHECK-NEXT:    or 9, 28, 11
-; CHECK-NEXT:    stw 9, 4(3)
+; CHECK-NEXT:    stw 11, 32(1)
+; CHECK-NEXT:    stw 10, 28(1)
+; CHECK-NEXT:    clrlwi 10, 12, 27
+; CHECK-NEXT:    stw 9, 24(1)
+; CHECK-NEXT:    stw 8, 20(1)
+; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT:    lwzux 9, 8, 7
+; CHECK-NEXT:    subfic 12, 10, 32
+; CHECK-NEXT:    lwz 11, 8(8)
+; CHECK-NEXT:    slw 9, 9, 10
+; CHECK-NEXT:    lwz 0, 4(8)
+; CHECK-NEXT:    lwz 8, 12(8)
+; CHECK-NEXT:    srw 30, 11, 12
+; CHECK-NEXT:    slw 29, 0, 10
+; CHECK-NEXT:    srw 0, 0, 12
+; CHECK-NEXT:    srw 12, 8, 12
+; CHECK-NEXT:    slw 11, 11, 10
+; CHECK-NEXT:    slw 8, 8, 10
+; CHECK-NEXT:    stw 8, 12(3)
+; CHECK-NEXT:    or 8, 11, 12
+; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    or 8, 9, 0
+; CHECK-NEXT:    stw 8, 0(3)
+; CHECK-NEXT:    or 8, 29, 30
+; CHECK-NEXT:    stw 8, 4(3)
 ; CHECK-NEXT:    bdnz .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
@@ -83,59 +77,53 @@ for.end:                                          ; preds = %for.body
 define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 2048
-; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 7
-; CHECK-NEXT:    mtctr 7
-; CHECK-NEXT:    addi 7, 1, 36
+; CHECK-NEXT:    stwu 1, -48(1)
+; CHECK-NEXT:    stw 30, 40(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 6, 2048
+; CHECK-NEXT:    mtctr 6
+; CHECK-NEXT:    addi 6, 1, 24
 ; CHECK-NEXT:  .LBB1_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 8, 0(4)
-; CHECK-NEXT:    lwz 10, 8(4)
-; CHECK-NEXT:    lwz 12, 12(5)
-; CHECK-NEXT:    lwz 9, 4(4)
-; CHECK-NEXT:    lwz 11, 12(4)
-; CHECK-NEXT:    stw 10, 44(1)
-; CHECK-NEXT:    rlwinm 10, 12, 29, 28, 31
-; CHECK-NEXT:    stw 8, 36(1)
-; CHECK-NEXT:    srawi 8, 8, 31
-; CHECK-NEXT:    stw 11, 48(1)
-; CHECK-NEXT:    clrlwi 11, 12, 29
-; CHECK-NEXT:    stw 9, 40(1)
-; CHECK-NEXT:    nand 9, 12, 6
-; CHECK-NEXT:    stw 8, 32(1)
-; CHECK-NEXT:    subfic 30, 11, 32
+; CHECK-NEXT:    lwz 7, 0(4)
+; CHECK-NEXT:    lwz 8, 4(4)
+; CHECK-NEXT:    lwz 11, 12(5)
+; CHECK-NEXT:    lwz 9, 8(4)
+; CHECK-NEXT:    lwz 10, 12(4)
 ; CHECK-NEXT:    stw 8, 28(1)
-; CHECK-NEXT:    clrlwi 9, 9, 27
-; CHECK-NEXT:    stw 8, 24(1)
-; CHECK-NEXT:    stw 8, 20(1)
-; CHECK-NEXT:    sub 8, 7, 10
-; CHECK-NEXT:    lwz 10, 4(8)
-; CHECK-NEXT:    lwz 12, 8(8)
-; CHECK-NEXT:    lwz 0, 0(8)
-; CHECK-NEXT:    lwz 8, 12(8)
-; CHECK-NEXT:    srw 29, 12, 11
-; CHECK-NEXT:    slw 12, 12, 30
-; CHECK-NEXT:    slw 30, 0, 30
-; CHECK-NEXT:    srw 8, 8, 11
-; CHECK-NEXT:    sraw 0, 0, 11
-; CHECK-NEXT:    srw 11, 10, 11
-; CHECK-NEXT:    slwi 10, 10, 1
-; CHECK-NEXT:    or 8, 12, 8
-; CHECK-NEXT:    slw 9, 10, 9
-; CHECK-NEXT:    stw 8, 12(3)
-; CHECK-NEXT:    or 8, 30, 11
-; CHECK-NEXT:    stw 8, 4(3)
-; CHECK-NEXT:    or 8, 29, 9
-; CHECK-NEXT:    stw 0, 0(3)
-; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    rlwinm 8, 11, 29, 28, 29
+; CHECK-NEXT:    stw 7, 24(1)
+; CHECK-NEXT:    srawi 7, 7, 31
+; CHECK-NEXT:    stw 10, 36(1)
+; CHECK-NEXT:    clrlwi 10, 11, 27
+; CHECK-NEXT:    stw 9, 32(1)
+; CHECK-NEXT:    subfic 12, 10, 32
+; CHECK-NEXT:    stw 7, 20(1)
+; CHECK-NEXT:    stw 7, 16(1)
+; CHECK-NEXT:    stw 7, 12(1)
+; CHECK-NEXT:    stw 7, 8(1)
+; CHECK-NEXT:    sub 7, 6, 8
+; CHECK-NEXT:    lwz 8, 4(7)
+; CHECK-NEXT:    lwz 9, 0(7)
+; CHECK-NEXT:    lwz 11, 12(7)
+; CHECK-NEXT:    srw 0, 8, 10
+; CHECK-NEXT:    lwz 7, 8(7)
+; CHECK-NEXT:    slw 30, 9, 12
+; CHECK-NEXT:    slw 8, 8, 12
+; CHECK-NEXT:    srw 11, 11, 10
+; CHECK-NEXT:    slw 12, 7, 12
+; CHECK-NEXT:    srw 7, 7, 10
+; CHECK-NEXT:    or 7, 8, 7
+; CHECK-NEXT:    stw 7, 8(3)
+; CHECK-NEXT:    or 7, 12, 11
+; CHECK-NEXT:    sraw 9, 9, 10
+; CHECK-NEXT:    stw 7, 12(3)
+; CHECK-NEXT:    or 7, 30, 0
+; CHECK-NEXT:    stw 9, 0(3)
+; CHECK-NEXT:    stw 7, 4(3)
 ; CHECK-NEXT:    bdnz .LBB1_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
-; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    lwz 30, 40(1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 48
 ; CHECK-NEXT:    blr
 entry:
   br label %for.body
@@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 8, 2048
 ; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 2048
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 7
-; CHECK-NEXT:    mtctr 8
-; CHECK-NEXT:    addi 8, 1, 32
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    mtctr 7
+; CHECK-NEXT:    addi 7, 1, 36
 ; CHECK-NEXT:  .LBB2_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 10, 4(4)
-; CHECK-NEXT:    lwz 0, 12(5)
-; CHECK-NEXT:    lwz 9, 0(4)
-; CHECK-NEXT:    lwz 11, 8(4)
-; CHECK-NEXT:    lwz 12, 12(4)
-; CHECK-NEXT:    stw 10, 36(1)
-; CHECK-NEXT:    rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT:    lwz 8, 0(4)
+; CHECK-NEXT:    lwz 12, 12(5)
+; CHECK-NEXT:    lwz 9, 4(4)
+; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    stw 8, 36(1)
+; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT:    stw 6, 32(1)
+; CHECK-NEXT:    sub 8, 7, 8
 ; CHECK-NEXT:    stw 6, 28(1)
-; CHECK-NEXT:    sub 10, 8, 10
 ; CHECK-NEXT:    stw 6, 24(1)
 ; CHECK-NEXT:    stw 6, 20(1)
-; CHECK-NEXT:    stw 6, 16(1)
-; CHECK-NEXT:    stw 12, 44(1)
-; CHECK-NEXT:    clrlwi 12, 0, 29
-; CHECK-NEXT:    stw 11, 40(1)
-; CHECK-NEXT:    subfic 29, 12, 32
-; CHECK-NEXT:    stw 9, 32(1)
-; CHECK-NEXT:    nand 9, 0, 7
-; CHECK-NEXT:    lwz 11, 4(10)
-; CHECK-NEXT:    clrlwi 9, 9, 27
-; CHECK-NEXT:    lwz 0, 8(10)
-; CHECK-NEXT:    lwz 30, 0(10)
-; CHECK-NEXT:    lwz 10, 12(10)
-; CHECK-NEXT:    srw 28, 0, 12
-; CHECK-NEXT:    slw 0, 0, 29
-; CHECK-NEXT:    slw 29, 30, 29
-; CHECK-NEXT:    srw 10, 10, 12
-; CHECK-NEXT:    srw 30, 30, 12
-; CHECK-NEXT:    srw 12, 11, 12
-; CHECK-NEXT:    slwi 11, 11, 1
-; CHECK-NEXT:    slw 9, 11, 9
-; CHECK-NEXT:    or 10, 0, 10
-; CHECK-NEXT:    stw 10, 12(3)
-; CHECK-NEXT:    or 10, 29, 12
-; CHECK-NEXT:    or 9, 28, 9
-; CHECK-NEXT:    stw 30, 0(3)
-; CHECK-NEXT:    stw 10, 4(3)
-; CHECK-NEXT:    stw 9, 8(3)
+; CHECK-NEXT:    stw 11, 48(1)
+; CHECK-NEXT:    clrlwi 11, 12, 27
+; CHECK-NEXT:    stw 10, 44(1)
+; CHECK-NEXT:    subfic 0, 11, 32
+; CHECK-NEXT:    stw 9, 40(1)
+; CHECK-NEXT:    lwz 9, 4(8)
+; CHECK-NEXT:    lwz 10, 0(8)
+; CHECK-NEXT:    lwz 12, 12(8)
+; CHECK-NEXT:    srw 30, 9, 11
+; CHECK-NEXT:    lwz 8, 8(8)
+; CHECK-NEXT:    slw 29, 10, 0
+; CHECK-NEXT:    slw 9, 9, 0
+; CHECK-NEXT:    srw 12, 12, 11
+; CHECK-NEXT:    slw 0, 8, 0
+; CHECK-NEXT:    srw 8, 8, 11
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    or 8, 0, 12
+; CHECK-NEXT:    srw 10, 10, 11
+; CHECK-NEXT:    stw 8, 12(3)
+; CHECK-NEXT:    or 8, 29, 30
+; CHECK-NEXT:    stw 10, 0(3)
+; CHECK-NEXT:    stw 8, 4(3)
 ; CHECK-NEXT:    bdnz .LBB2_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index 3e328c6ad9f0b..cc90300aafcea 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -33,36 +33,34 @@ define void @pr59074(ptr %0) {
 ; LE32-NEXT:    li 8, 12
 ; LE32-NEXT:    xxswapd 0, 0
 ; LE32-NEXT:    addi 4, 4, -12
-; LE32-NEXT:    rlwinm 9, 4, 29, 28, 31
+; LE32-NEXT:    rlwinm 9, 4, 29, 28, 29
 ; LE32-NEXT:    stxvd2x 0, 6, 5
 ; LE32-NEXT:    stw 7, 44(1)
 ; LE32-NEXT:    stw 7, 40(1)
 ; LE32-NEXT:    stw 7, 36(1)
 ; LE32-NEXT:    stw 8, 16(1)
+; LE32-NEXT:    clrlwi 4, 4, 27
 ; LE32-NEXT:    lwzux 5, 9, 6
-; LE32-NEXT:    li 6, 7
-; LE32-NEXT:    lwz 7, 8(9)
-; LE32-NEXT:    nand 6, 4, 6
-; LE32-NEXT:    lwz 8, 4(9)
-; LE32-NEXT:    clrlwi 4, 4, 29
-; LE32-NEXT:    lwz 9, 12(9)
-; LE32-NEXT:    clrlwi 6, 6, 27
 ; LE32-NEXT:    subfic 11, 4, 32
+; LE32-NEXT:    lwz 6, 8(9)
+; LE32-NEXT:    lwz 7, 4(9)
+; LE32-NEXT:    lwz 8, 12(9)
+; LE32-NEXT:    xori 9, 4, 31
 ; LE32-NEXT:    srw 5, 5, 4
-; LE32-NEXT:    slwi 10, 7, 1
-; LE32-NEXT:    srw 7, 7, 4
-; LE32-NEXT:    slw 6, 10, 6
-; LE32-NEXT:    srw 10, 8, 4
-; LE32-NEXT:    slw 8, 8, 11
-; LE32-NEXT:    slw 11, 9, 11
-; LE32-NEXT:    srw 4, 9, 4
-; LE32-NEXT:    or 5, 8, 5
-; LE32-NEXT:    or 7, 11, 7
-; LE32-NEXT:    or 6, 10, 6
+; LE32-NEXT:    slwi 10, 6, 1
+; LE32-NEXT:    srw 6, 6, 4
+; LE32-NEXT:    slw 9, 10, 9
+; LE32-NEXT:    srw 10, 7, 4
+; LE32-NEXT:    slw 7, 7, 11
+; LE32-NEXT:    slw 11, 8, 11
+; LE32-NEXT:    srw 4, 8, 4
+; LE32-NEXT:    or 5, 7, 5
+; LE32-NEXT:    or 6, 11, 6
+; LE32-NEXT:    or 7, 10, 9
 ; LE32-NEXT:    stw 4, 12(3)
-; LE32-NEXT:    stw 7, 8(3)
+; LE32-NEXT:    stw 6, 8(3)
 ; LE32-NEXT:    stw 5, 0(3)
-; LE32-NEXT:    stw 6, 4(3)
+; LE32-NEXT:    stw 7, 4(3)
 ; LE32-NEXT:    addi 1, 1, 80
 ; LE32-NEXT:    blr
 ;
@@ -89,37 +87,33 @@ define void @pr59074(ptr %0) {
 ; BE32-NEXT:    li 6, 12
 ; BE32-NEXT:    li 7, 0
 ; BE32-NEXT:    addi 8, 1, -48
-; BE32-NEXT:    li 10, 7
 ; BE32-NEXT:    stxvw4x 0, 0, 5
-; BE32-NEXT:    addi 4, 4, -12
 ; BE32-NEXT:    stw 6, -36(1)
+; BE32-NEXT:    addi 4, 4, -12
 ; BE32-NEXT:    stw 7, -40(1)
 ; BE32-NEXT:    stw 7, -44(1)
-; BE32-NEXT:    rlwinm 9, 4, 29, 28, 31
 ; BE32-NEXT:    stw 7, -48(1)
+; BE32-NEXT:    rlwinm 9, 4, 29, 28, 29
+; BE32-NEXT:    clrlwi 4, 4, 27
 ; BE32-NEXT:    sub 5, 8, 9
-; BE32-NEXT:    nand 6, 4, 10
-; BE32-NEXT:    clrlwi 4, 4, 29
-; BE32-NEXT:    clrlwi 6, 6, 27
-; BE32-NEXT:    lwz 7, 4(5)
-; BE32-NEXT:    lwz 8, 8(5)
-; BE32-NEXT:    lwz 9, 0(5)
-; BE32-NEXT:    lwz 5, 12(5)
-; BE32-NEXT:    slwi 10, 7, 1
-; BE32-NEXT:    srw 11, 8, 4
-; BE32-NEXT:    srw 7, 7, 4
-; BE32-NEXT:    srw 5, 5, 4
-; BE32-NEXT:    slw 6, 10, 6
+; BE32-NEXT:    lwz 6, 4(5)
+; BE32-NEXT:    lwz 7, 0(5)
+; BE32-NEXT:    lwz 8, 12(5)
+; BE32-NEXT:    lwz 5, 8(5)
 ; BE32-NEXT:    subfic 10, 4, 32
-; BE32-NEXT:    srw 4, 9, 4
-; BE32-NEXT:    slw 8, 8, 10
-; BE32-NEXT:    slw 10, 9, 10
-; BE32-NEXT:    or 6, 11, 6
-; BE32-NEXT:    or 7, 10, 7
-; BE32-NEXT:    or 5, 8, 5
+; BE32-NEXT:    srw 9, 6, 4
+; BE32-NEXT:    slw 11, 7, 10
+; BE32-NEXT:    srw 8, 8, 4
+; BE32-NEXT:    slw 6, 6, 10
+; BE32-NEXT:    slw 10, 5, 10
+; BE32-NEXT:    srw 5, 5, 4
+; BE32-NEXT:    srw 4, 7, 4
+; BE32-NEXT:    or 7, 11, 9
+; BE32-NEXT:    or 8, 10, 8
+; BE32-NEXT:    or 5, 6, 5
 ; BE32-NEXT:    stw 4, 0(3)
-; BE32-NEXT:    stw 6, 8(3)
-; BE32-NEXT:    stw 5, 12(3)
+; BE32-NEXT:    stw 5, 8(3)
+; BE32-NEXT:    stw 8, 12(3)
 ; BE32-NEXT:    stw 7, 4(3)
 ; BE32-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 044ddf562294c..98c76a7d3887c 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 6, 28(1)
 ; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
-; LE-32BIT-NEXT:    addi 6, 1, 32
-; LE-32BIT-NEXT:    stw 7, 32(1)
-; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    sub 6, 6, 7
+; LE-32BIT-NEXT:    addi 3, 1, 32
 ; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    li 3, 7
+; LE-32BIT-NEXT:    sub 3, 3, 6
 ; LE-32BIT-NEXT:    stw 8, 36(1)
-; LE-32BIT-NEXT:    nand 3, 4, 3
-; LE-32BIT-NEXT:    lwz 7, 4(6)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    lwz 8, 8(6)
-; LE-32BIT-NEXT:    subfic 10, 4, 32
-; LE-32BIT-NEXT:    lwz 9, 0(6)
-; LE-32BIT-NEXT:    clrlwi 3, 3, 27
-; LE-32BIT-NEXT:    lwz 6, 12(6)
-; LE-32BIT-NEXT:    srw 11, 8, 4
-; LE-32BIT-NEXT:    slw 8, 8, 10
-; LE-32BIT-NEXT:    slw 10, 9, 10
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    srw 9, 9, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
-; LE-32BIT-NEXT:    slw 3, 7, 3
-; LE-32BIT-NEXT:    or 6, 8, 6
-; LE-32BIT-NEXT:    or 4, 10, 4
-; LE-32BIT-NEXT:    or 3, 11, 3
-; LE-32BIT-NEXT:    stw 9, 0(5)
-; LE-32BIT-NEXT:    stw 6, 12(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
 ; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 3, 28(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 16
 ; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    subfic 8, 4, 32
 ; LE-32BIT-NEXT:    stw 7, 16(1)
-; LE-32BIT-NEXT:    li 7, 7
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
-; LE-32BIT-NEXT:    nand 7, 4, 7
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    subfic 10, 4, 32
-; LE-32BIT-NEXT:    lwz 8, 8(6)
-; LE-32BIT-NEXT:    clrlwi 7, 7, 27
 ; LE-32BIT-NEXT:    lwz 9, 4(6)
 ; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 7, 8(6)
 ; LE-32BIT-NEXT:    lwz 6, 12(6)
 ; LE-32BIT-NEXT:    slw 11, 9, 4
-; LE-32BIT-NEXT:    srw 9, 9, 10
-; LE-32BIT-NEXT:    srw 10, 6, 10
-; LE-32BIT-NEXT:    slw 6, 6, 4
-; LE-32BIT-NEXT:    slw 4, 8, 4
-; LE-32BIT-NEXT:    srwi 8, 8, 1
-; LE-32BIT-NEXT:    srw 7, 8, 7
+; LE-32BIT-NEXT:    srw 9, 9, 8
+; LE-32BIT-NEXT:    srw 10, 7, 8
+; LE-32BIT-NEXT:    srw 8, 6, 8
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
 ; LE-32BIT-NEXT:    or 3, 3, 9
-; LE-32BIT-NEXT:    or 4, 4, 10
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    or 4, 7, 8
 ; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    or 3, 11, 7
-; LE-32BIT-NEXT:    stw 6, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
 ; LE-32BIT-NEXT:    stw 4, 8(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
@@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    li 6, 7
+; LE-32BIT-NEXT:    addi 6, 1, 32
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    stw 8, 36(1)
-; LE-32BIT-NEXT:    rlwinm 8, 4, 29, 28, 31
 ; LE-32BIT-NEXT:    stw 7, 32(1)
-; LE-32BIT-NEXT:    addi 7, 1, 32
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    nand 6, 4, 6
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
 ; LE-32BIT-NEXT:    stw 3, 24(1)
-; LE-32BIT-NEXT:    subfic 10, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 20(1)
-; LE-32BIT-NEXT:    clrlwi 6, 6, 27
 ; LE-32BIT-NEXT:    stw 3, 16(1)
-; LE-32BIT-NEXT:    sub 3, 7, 8
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    lwz 8, 8(3)
-; LE-32BIT-NEXT:    lwz 9, 0(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    srw 11, 8, 4
-; LE-32BIT-NEXT:    slw 8, 8, 10
-; LE-32BIT-NEXT:    slw 10, 9, 10
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
 ; LE-32BIT-NEXT:    srw 3, 3, 4
-; LE-32BIT-NEXT:    sraw 9, 9, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
-; LE-32BIT-NEXT:    or 3, 8, 3
-; LE-32BIT-NEXT:    slw 6, 7, 6
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    sraw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 11, 6
-; LE-32BIT-NEXT:    stw 9, 0(5)
-; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    li 4, 48
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
-; LE-64BIT-NEXT:    rlwinm 4, 3, 29, 27, 31
+; LE-64BIT-NEXT:    rlwinm 4, 3, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 3, 3, 26
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
-; LE-64BIT-NEXT:    li 6, 7
-; LE-64BIT-NEXT:    ldux 7, 4, 7
-; LE-64BIT-NEXT:    ld 8, 16(4)
-; LE-64BIT-NEXT:    nand 6, 3, 6
+; LE-64BIT-NEXT:    xori 8, 3, 63
+; LE-64BIT-NEXT:    ldux 6, 4, 7
+; LE-64BIT-NEXT:    ld 7, 16(4)
 ; LE-64BIT-NEXT:    ld 9, 8(4)
-; LE-64BIT-NEXT:    clrlwi 3, 3, 29
 ; LE-64BIT-NEXT:    ld 4, 24(4)
-; LE-64BIT-NEXT:    clrlwi 6, 6, 26
+; LE-64BIT-NEXT:    srd 6, 6, 3
+; LE-64BIT-NEXT:    sldi 11, 7, 1
+; LE-64BIT-NEXT:    srd 10, 9, 3
 ; LE-64BIT-NEXT:    srd 7, 7, 3
-; LE-64BIT-NEXT:    sldi 10, 8, 1
-; LE-64BIT-NEXT:    srd 11, 9, 3
-; LE-64BIT-NEXT:    srd 8, 8, 3
-; LE-64BIT-NEXT:    sld 6, 10, 6
+; LE-64BIT-NEXT:    sld 8, 11, 8
+; LE-64BIT-NEXT:    or 8, 10, 8
 ; LE-64BIT-NEXT:    subfic 10, 3, 64
 ; LE-64BIT-NEXT:    srd 3, 4, 3
-; LE-64BIT-NEXT:    or 6, 11, 6
 ; LE-64BIT-NEXT:    sld 11, 4, 10
 ; LE-64BIT-NEXT:    sld 9, 9, 10
 ; LE-64BIT-NEXT:    std 3, 24(5)
-; LE-64BIT-NEXT:    or 7, 9, 7
-; LE-64BIT-NEXT:    or 3, 11, 8
-; LE-64BIT-NEXT:    std 6, 8(5)
-; LE-64BIT-NEXT:    std 7, 0(5)
+; LE-64BIT-NEXT:    std 8, 8(5)
+; LE-64BIT-NEXT:    or 6, 9, 6
+; LE-64BIT-NEXT:    or 3, 11, 7
+; LE-64BIT-NEXT:    std 6, 0(5)
 ; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
@@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    addi 11, 1, -32
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 9, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 9, -56(1)
+; BE-NEXT:    std 9, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 28
 ; BE-NEXT:    neg 3, 3
-; BE-NEXT:    std 10, 24(9)
-; BE-NEXT:    std 10, 16(9)
-; BE-NEXT:    std 10, 8(9)
-; BE-NEXT:    std 10, -64(1)
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
-; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    std 8, -16(1)
+; BE-NEXT:    std 7, -24(1)
+; BE-NEXT:    std 6, -32(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    ldux 3, 11, 3
-; BE-NEXT:    li 6, 7
-; BE-NEXT:    nand 6, 4, 6
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 6, 6, 26
-; BE-NEXT:    ld 7, 8(11)
-; BE-NEXT:    ld 8, 16(11)
-; BE-NEXT:    ld 9, 24(11)
-; BE-NEXT:    subfic 10, 4, 64
-; BE-NEXT:    sldi 11, 7, 1
-; BE-NEXT:    srd 7, 7, 4
-; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    sld 6, 11, 6
-; BE-NEXT:    sld 11, 3, 10
-; BE-NEXT:    sld 10, 8, 10
-; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 6, 8(10)
+; BE-NEXT:    ld 7, 24(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    sld 10, 3, 9
 ; BE-NEXT:    srd 3, 3, 4
-; BE-NEXT:    or 7, 11, 7
-; BE-NEXT:    or 6, 8, 6
-; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 24(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    srd 11, 6, 4
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    sld 6, 6, 9
+; BE-NEXT:    sld 9, 8, 9
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 6, 6, 8
 ; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
@@ -546,68 +527,64 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 28(1)
 ; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
 ; LE-32BIT-NEXT:    stw 3, 80(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 52
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    sub 3, 3, 6
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 30, 4, 32
 ; LE-32BIT-NEXT:    stw 0, 76(1)
 ; LE-32BIT-NEXT:    stw 12, 72(1)
+; LE-32BIT-NEXT:    xori 12, 4, 31
 ; LE-32BIT-NEXT:    stw 11, 68(1)
 ; LE-32BIT-NEXT:    stw 10, 64(1)
 ; LE-32BIT-NEXT:    stw 9, 60(1)
-; LE-32BIT-NEXT:    li 9, 7
 ; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    nand 9, 4, 9
 ; LE-32BIT-NEXT:    stw 7, 52(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    lwz 6, 4(3)
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    clrlwi 9, 9, 27
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    slwi 29, 6, 1
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    srw 28, 7, 4
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    slwi 27, 8, 1
-; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    srw 29, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 12(3)
+; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    srw 27, 9, 4
+; LE-32BIT-NEXT:    lwz 0, 28(3)
 ; LE-32BIT-NEXT:    srw 26, 10, 4
-; LE-32BIT-NEXT:    lwz 0, 0(3)
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    srw 25, 12, 4
-; LE-32BIT-NEXT:    slw 12, 12, 30
-; LE-32BIT-NEXT:    slw 7, 7, 30
-; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    slw 25, 11, 30
+; LE-32BIT-NEXT:    slw 9, 9, 30
 ; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 0, 30
-; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
 ; LE-32BIT-NEXT:    srw 0, 0, 4
-; LE-32BIT-NEXT:    srw 4, 11, 4
-; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 30, 0
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
-; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 7, 8
-; LE-32BIT-NEXT:    slw 29, 29, 9
-; LE-32BIT-NEXT:    slw 27, 27, 9
-; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    or 3, 6, 27
+; LE-32BIT-NEXT:    slw 7, 7, 12
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    or 3, 28, 4
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 25, 9
-; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
-; LE-32BIT-NEXT:    stw 3, 16(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 7
+; LE-32BIT-NEXT:    stw 8, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
@@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 6
 ; LE-64BIT-NEXT:    li 6, 48
-; LE-64BIT-NEXT:    rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT:    rlwinm 3, 4, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 4, 4, 26
 ; LE-64BIT-NEXT:    neg 3, 3
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    li 6, 32
 ; LE-64BIT-NEXT:    extsw 3, 3
 ; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
-; LE-64BIT-NEXT:    li 6, 7
+; LE-64BIT-NEXT:    subfic 6, 4, 64
 ; LE-64BIT-NEXT:    ldux 3, 8, 3
-; LE-64BIT-NEXT:    ld 7, 8(8)
-; LE-64BIT-NEXT:    nand 6, 4, 6
-; LE-64BIT-NEXT:    ld 9, 16(8)
-; LE-64BIT-NEXT:    clrlwi 4, 4, 29
-; LE-64BIT-NEXT:    ld 8, 24(8)
-; LE-64BIT-NEXT:    clrlwi 6, 6, 26
-; LE-64BIT-NEXT:    rldicl 10, 7, 63, 1
-; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    ld 7, 16(8)
+; LE-64BIT-NEXT:    ld 9, 24(8)
+; LE-64BIT-NEXT:    ld 8, 8(8)
+; LE-64BIT-NEXT:    srd 10, 7, 6
+; LE-64BIT-NEXT:    sld 9, 9, 4
 ; LE-64BIT-NEXT:    sld 7, 7, 4
-; LE-64BIT-NEXT:    srd 6, 10, 6
-; LE-64BIT-NEXT:    sld 10, 9, 4
-; LE-64BIT-NEXT:    or 6, 10, 6
-; LE-64BIT-NEXT:    subfic 10, 4, 64
-; LE-64BIT-NEXT:    srd 9, 9, 10
-; LE-64BIT-NEXT:    srd 10, 3, 10
+; LE-64BIT-NEXT:    or 9, 9, 10
+; LE-64BIT-NEXT:    srd 10, 8, 6
+; LE-64BIT-NEXT:    srd 6, 3, 6
+; LE-64BIT-NEXT:    sld 8, 8, 4
 ; LE-64BIT-NEXT:    sld 3, 3, 4
-; LE-64BIT-NEXT:    std 6, 16(5)
-; LE-64BIT-NEXT:    or 7, 7, 10
+; LE-64BIT-NEXT:    or 6, 8, 6
 ; LE-64BIT-NEXT:    std 3, 0(5)
-; LE-64BIT-NEXT:    or 3, 8, 9
-; LE-64BIT-NEXT:    std 7, 8(5)
-; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 7, 10
+; LE-64BIT-NEXT:    std 9, 24(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: shl_32bytes:
@@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 56(9)
-; BE-NEXT:    std 10, 48(9)
-; BE-NEXT:    std 10, 40(9)
-; BE-NEXT:    std 10, 32(9)
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 8, 16(9)
-; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -64
+; BE-NEXT:    std 9, -8(1)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 9, -24(1)
+; BE-NEXT:    std 9, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 8, -48(1)
+; BE-NEXT:    std 7, -56(1)
 ; BE-NEXT:    std 6, -64(1)
-; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
-; BE-NEXT:    ldux 6, 3, 9
-; BE-NEXT:    li 7, 7
-; BE-NEXT:    nand 7, 4, 7
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 7, 7, 26
-; BE-NEXT:    ld 8, 16(3)
-; BE-NEXT:    ld 9, 8(3)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 28
+; BE-NEXT:    ldux 6, 3, 10
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
 ; BE-NEXT:    ld 3, 24(3)
-; BE-NEXT:    subfic 10, 4, 64
 ; BE-NEXT:    sld 6, 6, 4
-; BE-NEXT:    rldicl 11, 8, 63, 1
-; BE-NEXT:    sld 8, 8, 4
-; BE-NEXT:    srd 7, 11, 7
-; BE-NEXT:    srd 11, 9, 10
-; BE-NEXT:    sld 9, 9, 4
-; BE-NEXT:    srd 10, 3, 10
+; BE-NEXT:    srd 10, 7, 9
+; BE-NEXT:    sld 11, 8, 4
+; BE-NEXT:    srd 8, 8, 9
+; BE-NEXT:    srd 9, 3, 9
+; BE-NEXT:    sld 7, 7, 4
 ; BE-NEXT:    sld 3, 3, 4
-; BE-NEXT:    or 6, 6, 11
-; BE-NEXT:    or 7, 9, 7
-; BE-NEXT:    or 8, 8, 10
+; BE-NEXT:    or 10, 11, 10
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    or 7, 7, 9
 ; BE-NEXT:    std 3, 24(5)
-; BE-NEXT:    std 8, 16(5)
+; BE-NEXT:    std 7, 16(5)
 ; BE-NEXT:    std 6, 0(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
@@ -739,61 +708,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 60(1)
 ; LE-32BIT-NEXT:    stw 6, 56(1)
 ; LE-32BIT-NEXT:    stw 6, 52(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
 ; LE-32BIT-NEXT:    stw 3, 48(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 20
 ; LE-32BIT-NEXT:    stw 0, 44(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 12, 40(1)
+; LE-32BIT-NEXT:    subfic 12, 4, 32
 ; LE-32BIT-NEXT:    stw 11, 36(1)
 ; LE-32BIT-NEXT:    stw 10, 32(1)
 ; LE-32BIT-NEXT:    stw 9, 28(1)
 ; LE-32BIT-NEXT:    stw 8, 24(1)
-; LE-32BIT-NEXT:    li 8, 7
 ; LE-32BIT-NEXT:    stw 7, 20(1)
-; LE-32BIT-NEXT:    nand 8, 4, 8
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    subfic 0, 4, 32
-; LE-32BIT-NEXT:    clrlwi 8, 8, 27
 ; LE-32BIT-NEXT:    lwz 7, 8(6)
 ; LE-32BIT-NEXT:    slw 3, 3, 4
-; LE-32BIT-NEXT:    lwz 9, 4(6)
-; LE-32BIT-NEXT:    lwz 10, 16(6)
-; LE-32BIT-NEXT:    srwi 29, 7, 1
-; LE-32BIT-NEXT:    lwz 11, 12(6)
-; LE-32BIT-NEXT:    slw 28, 9, 4
-; LE-32BIT-NEXT:    lwz 12, 24(6)
-; LE-32BIT-NEXT:    srwi 27, 10, 1
-; LE-32BIT-NEXT:    lwz 30, 20(6)
-; LE-32BIT-NEXT:    slw 26, 11, 4
+; LE-32BIT-NEXT:    lwz 8, 4(6)
+; LE-32BIT-NEXT:    lwz 9, 16(6)
+; LE-32BIT-NEXT:    srw 30, 7, 12
+; LE-32BIT-NEXT:    lwz 10, 12(6)
+; LE-32BIT-NEXT:    slw 29, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 24(6)
+; LE-32BIT-NEXT:    srw 8, 8, 12
+; LE-32BIT-NEXT:    lwz 0, 20(6)
+; LE-32BIT-NEXT:    srw 28, 9, 12
 ; LE-32BIT-NEXT:    lwz 6, 28(6)
-; LE-32BIT-NEXT:    srw 9, 9, 0
-; LE-32BIT-NEXT:    slw 25, 30, 4
-; LE-32BIT-NEXT:    srw 11, 11, 0
+; LE-32BIT-NEXT:    slw 27, 10, 4
+; LE-32BIT-NEXT:    srw 10, 10, 12
 ; LE-32BIT-NEXT:    slw 7, 7, 4
-; LE-32BIT-NEXT:    srw 30, 30, 0
-; LE-32BIT-NEXT:    slw 10, 10, 4
-; LE-32BIT-NEXT:    srw 0, 6, 0
-; LE-32BIT-NEXT:    slw 6, 6, 4
-; LE-32BIT-NEXT:    slw 4, 12, 4
-; LE-32BIT-NEXT:    srwi 12, 12, 1
-; LE-32BIT-NEXT:    srw 29, 29, 8
-; LE-32BIT-NEXT:    srw 27, 27, 8
-; LE-32BIT-NEXT:    srw 8, 12, 8
-; LE-32BIT-NEXT:    or 3, 3, 9
-; LE-32BIT-NEXT:    or 4, 4, 0
-; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    or 3, 25, 8
+; LE-32BIT-NEXT:    srw 26, 11, 12
+; LE-32BIT-NEXT:    slw 25, 0, 4
+; LE-32BIT-NEXT:    srw 0, 0, 12
+; LE-32BIT-NEXT:    slw 9, 9, 4
+; LE-32BIT-NEXT:    srw 12, 6, 12
+; LE-32BIT-NEXT:    slw 11, 11, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    stw 4, 28(5)
+; LE-32BIT-NEXT:    or 4, 11, 12
 ; LE-32BIT-NEXT:    stw 4, 24(5)
-; LE-32BIT-NEXT:    or 4, 10, 30
-; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
+; LE-32BIT-NEXT:    or 4, 9, 0
 ; LE-32BIT-NEXT:    stw 4, 16(5)
-; LE-32BIT-NEXT:    or 4, 7, 11
-; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 6, 28(5)
+; LE-32BIT-NEXT:    or 4, 25, 26
+; LE-32BIT-NEXT:    stw 4, 20(5)
+; LE-32BIT-NEXT:    or 4, 7, 10
+; LE-32BIT-NEXT:    or 3, 3, 8
 ; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    or 4, 27, 28
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 12(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
@@ -812,91 +775,84 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
-; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    ld 6, 24(3)
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
 ; LE-64BIT-NEXT:    addi 7, 1, -64
 ; LE-64BIT-NEXT:    ld 3, 16(3)
 ; LE-64BIT-NEXT:    sradi 8, 6, 63
-; LE-64BIT-NEXT:    rlwinm 9, 4, 29, 27, 31
-; LE-64BIT-NEXT:    std 6, 24(7)
-; LE-64BIT-NEXT:    std 3, 16(7)
-; LE-64BIT-NEXT:    li 3, 7
-; LE-64BIT-NEXT:    std 8, 56(7)
-; LE-64BIT-NEXT:    std 8, 48(7)
-; LE-64BIT-NEXT:    std 8, 40(7)
-; LE-64BIT-NEXT:    std 8, 32(7)
+; LE-64BIT-NEXT:    rlwinm 9, 4, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 4, 4, 26
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 7
-; LE-64BIT-NEXT:    nand 3, 4, 3
-; LE-64BIT-NEXT:    clrlwi 4, 4, 29
-; LE-64BIT-NEXT:    ldux 6, 9, 7
-; LE-64BIT-NEXT:    ld 7, 16(9)
+; LE-64BIT-NEXT:    std 6, -40(1)
+; LE-64BIT-NEXT:    std 3, -48(1)
+; LE-64BIT-NEXT:    std 8, -8(1)
+; LE-64BIT-NEXT:    std 8, -16(1)
+; LE-64BIT-NEXT:    std 8, -24(1)
+; LE-64BIT-NEXT:    std 8, -32(1)
+; LE-64BIT-NEXT:    ldux 3, 9, 7
+; LE-64BIT-NEXT:    xori 7, 4, 63
+; LE-64BIT-NEXT:    ld 6, 16(9)
 ; LE-64BIT-NEXT:    ld 8, 8(9)
-; LE-64BIT-NEXT:    clrlwi 3, 3, 26
 ; LE-64BIT-NEXT:    ld 9, 24(9)
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    sldi 11, 6, 1
+; LE-64BIT-NEXT:    srd 10, 8, 4
 ; LE-64BIT-NEXT:    srd 6, 6, 4
-; LE-64BIT-NEXT:    sldi 10, 7, 1
-; LE-64BIT-NEXT:    srd 11, 8, 4
-; LE-64BIT-NEXT:    srd 7, 7, 4
-; LE-64BIT-NEXT:    sld 3, 10, 3
+; LE-64BIT-NEXT:    sld 7, 11, 7
+; LE-64BIT-NEXT:    or 7, 10, 7
 ; LE-64BIT-NEXT:    subfic 10, 4, 64
 ; LE-64BIT-NEXT:    srad 4, 9, 4
-; LE-64BIT-NEXT:    or 3, 11, 3
-; LE-64BIT-NEXT:    sld 11, 9, 10
 ; LE-64BIT-NEXT:    sld 8, 8, 10
+; LE-64BIT-NEXT:    sld 11, 9, 10
 ; LE-64BIT-NEXT:    std 4, 24(5)
-; LE-64BIT-NEXT:    or 6, 8, 6
-; LE-64BIT-NEXT:    or 4, 11, 7
-; LE-64BIT-NEXT:    std 3, 8(5)
-; LE-64BIT-NEXT:    std 6, 0(5)
-; LE-64BIT-NEXT:    std 4, 16(5)
+; LE-64BIT-NEXT:    std 7, 8(5)
+; LE-64BIT-NEXT:    or 3, 8, 3
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    or 3, 11, 6
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: ashr_32bytes:
 ; BE:       # %bb.0:
-; BE-NEXT:    ld 6, 0(3)
-; BE-NEXT:    ld 7, 8(3)
-; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    addi 10, 1, -32
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    std 6, 32(9)
-; BE-NEXT:    sradi 3, 6, 63
-; BE-NEXT:    rlwinm 6, 4, 29, 27, 31
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 3, 16(9)
-; BE-NEXT:    std 3, 8(9)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    std 7, -32(1)
+; BE-NEXT:    sradi 3, 7, 63
+; BE-NEXT:    rlwinm 7, 4, 29, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
 ; BE-NEXT:    std 3, -64(1)
-; BE-NEXT:    neg 3, 6
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
+; BE-NEXT:    neg 3, 7
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    ldux 3, 10, 3
-; BE-NEXT:    li 6, 7
-; BE-NEXT:    nand 6, 4, 6
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 6, 6, 26
-; BE-NEXT:    ld 7, 8(10)
-; BE-NEXT:    ld 8, 16(10)
-; BE-NEXT:    ld 9, 24(10)
-; BE-NEXT:    subfic 10, 4, 64
-; BE-NEXT:    sldi 11, 7, 1
-; BE-NEXT:    srd 7, 7, 4
-; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    sld 6, 11, 6
-; BE-NEXT:    sld 11, 3, 10
-; BE-NEXT:    sld 10, 8, 10
-; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 8(6)
+; BE-NEXT:    ld 8, 24(6)
+; BE-NEXT:    ld 6, 16(6)
+; BE-NEXT:    sld 10, 3, 9
 ; BE-NEXT:    srad 3, 3, 4
-; BE-NEXT:    or 7, 11, 7
-; BE-NEXT:    or 6, 8, 6
-; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 24(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    srd 11, 7, 4
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    sld 7, 7, 9
+; BE-NEXT:    sld 9, 6, 9
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 8, 9, 8
+; BE-NEXT:    or 6, 7, 6
 ; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: ashr_32bytes:
@@ -915,73 +871,69 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 3, 80(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
 ; LE-32BIT-NEXT:    stw 7, 52(1)
-; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 29
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 30, 4, 32
 ; LE-32BIT-NEXT:    stw 0, 76(1)
 ; LE-32BIT-NEXT:    stw 12, 72(1)
+; LE-32BIT-NEXT:    xori 12, 4, 31
 ; LE-32BIT-NEXT:    stw 11, 68(1)
 ; LE-32BIT-NEXT:    stw 10, 64(1)
 ; LE-32BIT-NEXT:    stw 9, 60(1)
-; LE-32BIT-NEXT:    li 9, 7
 ; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    nand 9, 4, 9
 ; LE-32BIT-NEXT:    stw 3, 48(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    subfic 30, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 40(1)
-; LE-32BIT-NEXT:    clrlwi 9, 9, 27
 ; LE-32BIT-NEXT:    stw 3, 36(1)
 ; LE-32BIT-NEXT:    stw 3, 32(1)
 ; LE-32BIT-NEXT:    stw 3, 28(1)
 ; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
 ; LE-32BIT-NEXT:    sub 3, 6, 7
-; LE-32BIT-NEXT:    lwz 6, 4(3)
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    slwi 29, 6, 1
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    srw 28, 7, 4
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    slwi 27, 8, 1
-; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    srw 29, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 12(3)
+; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    srw 27, 9, 4
+; LE-32BIT-NEXT:    lwz 0, 28(3)
 ; LE-32BIT-NEXT:    srw 26, 10, 4
-; LE-32BIT-NEXT:    lwz 0, 0(3)
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    srw 25, 12, 4
-; LE-32BIT-NEXT:    slw 12, 12, 30
-; LE-32BIT-NEXT:    slw 7, 7, 30
-; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    slw 25, 11, 30
+; LE-32BIT-NEXT:    slw 9, 9, 30
 ; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 0, 30
-; LE-32BIT-NEXT:    srw 8, 8, 4
-; LE-32BIT-NEXT:    sraw 0, 0, 4
-; LE-32BIT-NEXT:    srw 4, 11, 4
-; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 0, 0, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 30, 0
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
-; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    sraw 8, 8, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 7, 8
-; LE-32BIT-NEXT:    slw 29, 29, 9
-; LE-32BIT-NEXT:    slw 27, 27, 9
-; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    or 3, 6, 27
+; LE-32BIT-NEXT:    slw 7, 7, 12
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    or 3, 28, 4
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 25, 9
-; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
-; LE-32BIT-NEXT:    stw 3, 16(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 7
+; LE-32BIT-NEXT:    stw 8, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1c303de55c95d..54106bde42527 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,7 +177,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $156, %esp
+; X86-NEXT:    subl $152, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -194,48 +194,47 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    xorl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    bsrl %esi, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %ebp, %edx
 ; X86-NEXT:    xorl $31, %edx
@@ -247,12 +246,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnel %edx, %edi
 ; X86-NEXT:    addl $64, %edi
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %edx
 ; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -261,7 +260,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -270,299 +269,293 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %ecx
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.8: # %_udiv-special-cases
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    xorl $127, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    je .LBB4_8
-; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    xorb $127, %al
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movl 148(%esp,%edi), %edx
-; X86-NEXT:    movl 152(%esp,%edi), %esi
-; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 144(%esp,%eax), %edx
+; X86-NEXT:    movl 148(%esp,%eax), %esi
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 144(%esp,%edi), %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    shrl %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    movl 140(%esp,%edi), %edx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %eax
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    jae .LBB4_3
+; X86-NEXT:    movl 136(%esp,%eax), %esi
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_9
+; X86-NEXT:  .LBB4_2: # %udiv-preheader
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dl, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %edx
-; X86-NEXT:    movl 104(%esp,%edx), %ebx
-; X86-NEXT:    movl 100(%esp,%edx), %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %ebp
-; X86-NEXT:    movl 92(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 100(%esp,%edx), %ebx
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl 96(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%edx), %eax
+; X86-NEXT:    movl 92(%esp,%edx), %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    shrdl %cl, %esi, %ebp
 ; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB4_4: # %udiv-do-while
+; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebp
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    shldl $1, %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %ecx
 ; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    subl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl $-1, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  # %bb.5:
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    jne .LBB4_3
+; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebp, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %eax, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    shldl $1, %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:  .LBB4_9: # %udiv-end
+; X86-NEXT:    xorl %ebp, %eax
+; X86-NEXT:    xorl %ebp, %edx
+; X86-NEXT:    xorl %ebp, %ecx
+; X86-NEXT:    xorl %ebp, %esi
+; X86-NEXT:    subl %ebp, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    sbbl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, (%edi)
+; X86-NEXT:    movl %ecx, 4(%edi)
+; X86-NEXT:    movl %edx, 8(%edi)
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    imull %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    addl %edx, %esi
@@ -572,7 +565,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -584,7 +577,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $156, %esp
+; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index fa45afbb634c4..84f35c6485abe 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -177,14 +177,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $136, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    subl $132, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sete %bl
@@ -195,95 +195,97 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %bl, %al
 ; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl %edi, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl $31, %ebp
 ; X86-NEXT:    addl $32, %ebp
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ebp
 ; X86-NEXT:    addl $64, %ebp
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %ebp
-; X86-NEXT:    bsrl %esi, %edx
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl %edi, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    subl %edx, %ebp
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    cmpl %ebp, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb (%esp), %cl # 1-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    cmovnel %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    cmovnel %edx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmovnel %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovnel %edi, %ebx
+; X86-NEXT:    cmovnel %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    cmovnel %edi, %edx
 ; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -291,89 +293,78 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    xorb $127, %al
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 128(%esp,%eax), %edx
-; X86-NEXT:    movl 132(%esp,%eax), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    notb %cl
 ; X86-NEXT:    movl 124(%esp,%eax), %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %ebp, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 116(%esp,%eax), %edx
 ; X86-NEXT:    movl 120(%esp,%eax), %eax
-; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %eax, %ebp
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    shldl %cl, %edx, %ebp
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_6
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 84(%esp,%eax), %ebx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 80(%esp,%eax), %esi
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 76(%esp,%eax), %ebp
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    shrdl %cl, %esi, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %ebp
-; X86-NEXT:    movl 76(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shrdl %cl, %edx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%esp,%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%esp,%eax), %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -383,148 +374,145 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    orl %ebp, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %ebp, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    shldl $1, %ebp, %ebx
 ; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebp
+; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:  .LBB4_7: # %udiv-end
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %ecx, (%esi)
+; X86-NEXT:    movl %ebp, 4(%esi)
+; X86-NEXT:    movl %ebx, 8(%esi)
+; X86-NEXT:    movl %edx, 12(%esi)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %ebp, %esi
 ; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -532,11 +520,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -546,7 +534,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $136, %esp
+; X86-NEXT:    addl $132, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index fbc363f77ec42..3dbd0213293bb 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,28 +23,28 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %ecx
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    divb %cl
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    shll $30, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $30, %ecx
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    shrdl $1, %eax, %ecx
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    andl $3, %edx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %BB_udiv-special-cases
 ; X86-NEXT:    bsrl %esi, %eax
@@ -52,18 +52,19 @@ define void @f() nounwind {
 ; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    jmp .LBB0_3
 ; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    bsrl %edx, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB0_3: # %BB_udiv-special-cases
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_4
 ; X86-NEXT:  # %bb.5: # %BB_udiv-special-cases
 ; X86-NEXT:    addl $64, %eax
 ; X86-NEXT:    jmp .LBB0_6
 ; X86-NEXT:  .LBB0_4:
-; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    bsrl %edi, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:  .LBB0_6: # %BB_udiv-special-cases
@@ -82,7 +83,6 @@ define void @f() nounwind {
 ; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl $65, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    je .LBB0_11
@@ -92,17 +92,16 @@ define void @f() nounwind {
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    andl $3, %ebx
 ; X86-NEXT:    movb $65, %cl
 ; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    negb %cl
-; X86-NEXT:    movsbl %cl, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %esi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -111,31 +110,22 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 120(%esp,%eax), %edi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 112(%esp,%eax), %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 116(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl 112(%esp,%esi), %eax
+; X86-NEXT:    movl 116(%esp,%esi), %edx
+; X86-NEXT:    movl 120(%esp,%esi), %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB0_11
 ; X86-NEXT:  # %bb.9: # %udiv-preheader
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -144,25 +134,18 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 72(%esp,%eax), %edx
 ; X86-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NEXT:    movl 68(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 72(%esp,%eax), %ebx
-; X86-NEXT:    addl %ebx, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl 68(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -180,60 +163,59 @@ define void @f() nounwind {
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB0_10: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shldl $1, %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    andl $2, %edx
 ; X86-NEXT:    shrl %edx
 ; X86-NEXT:    leal (%edx,%ebx,2), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    shll $30, %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sarl $30, %edx
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    shrdl $1, %esi, %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    shll $30, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sarl $30, %edi
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    shrdl $1, %edx, %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    subl %edi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $3, %esi
-; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $3, %edi
+; X86-NEXT:    andl $3, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    jne .LBB0_10
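
The recurring "shrb $3; andb $12" (i686) and "andb $24" (x86-64) sequences in the updated checks above are the new slot-offset computation: the bit-count shift amount is scaled down to bytes and then rounded to the native load unit, so every subsequent load reads from an aligned address. A minimal sketch of that arithmetic, with a hypothetical helper name that is not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Sketch only: byte offset into the temporary stack slot at which
    // the aligned loads start. "shrb $3" is the divide-by-8; "andb $12"
    // (4-byte units on i686) or "andb $24" (8-byte units on x86-64) is
    // the round-down to the load unit.
    uint32_t alignedSlotOffset(uint32_t ShAmtBits, uint32_t UnitBytes) {
      uint32_t ByteOff = ShAmtBits / 8;   // bits -> bytes
      return ByteOff & ~(UnitBytes - 1);  // round down to the unit
    }

    int main() {
      assert(alignedSlotOffset(37, 4) == 4); // 37 >> 3 == 4; 4 & 12 == 4
      assert(alignedSlotOffset(37, 8) == 0); // 4 & 24 == 0
      return 0;
    }
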
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 785b97d8c2402..b2ff06798aad7 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,6 +14,7 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    movq %rdi, %rax
 ; ILP-NEXT:    leal (%rsi,%rsi), %ecx
+; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -22,10 +23,9 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movl %ecx, %edx
 ; ILP-NEXT:    shrb $3, %dl
-; ILP-NEXT:    andb $7, %cl
+; ILP-NEXT:    andb $24, %dl
 ; ILP-NEXT:    negb %dl
 ; ILP-NEXT:    movsbq %dl, %rdx
 ; ILP-NEXT:    movq -16(%rsp,%rdx), %rsi
@@ -60,13 +60,13 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    addl %esi, %esi
-; HYBRID-NEXT:    addb $3, %sil
-; HYBRID-NEXT:    movl %esi, %ecx
-; HYBRID-NEXT:    andb $7, %cl
-; HYBRID-NEXT:    shrb $3, %sil
-; HYBRID-NEXT:    negb %sil
-; HYBRID-NEXT:    movsbq %sil, %rdx
+; HYBRID-NEXT:    leal (%rsi,%rsi), %ecx
+; HYBRID-NEXT:    addb $3, %cl
+; HYBRID-NEXT:    movl %ecx, %edx
+; HYBRID-NEXT:    shrb $3, %dl
+; HYBRID-NEXT:    andb $24, %dl
+; HYBRID-NEXT:    negb %dl
+; HYBRID-NEXT:    movsbq %dl, %rdx
 ; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rsi
 ; HYBRID-NEXT:    movq -8(%rsp,%rdx), %rdi
 ; HYBRID-NEXT:    shldq %cl, %rsi, %rdi
@@ -81,6 +81,7 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    shlq %cl, %rsi
 ; HYBRID-NEXT:    notb %cl
 ; HYBRID-NEXT:    shrq %rdx
+; HYBRID-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; HYBRID-NEXT:    shrq %cl, %rdx
 ; HYBRID-NEXT:    orq %rsi, %rdx
 ; HYBRID-NEXT:    movq %rdx, 16(%rax)
@@ -97,13 +98,13 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    addl %esi, %esi
-; BURR-NEXT:    addb $3, %sil
-; BURR-NEXT:    movl %esi, %ecx
-; BURR-NEXT:    andb $7, %cl
-; BURR-NEXT:    shrb $3, %sil
-; BURR-NEXT:    negb %sil
-; BURR-NEXT:    movsbq %sil, %rdx
+; BURR-NEXT:    leal (%rsi,%rsi), %ecx
+; BURR-NEXT:    addb $3, %cl
+; BURR-NEXT:    movl %ecx, %edx
+; BURR-NEXT:    shrb $3, %dl
+; BURR-NEXT:    andb $24, %dl
+; BURR-NEXT:    negb %dl
+; BURR-NEXT:    movsbq %dl, %rdx
 ; BURR-NEXT:    movq -16(%rsp,%rdx), %rsi
 ; BURR-NEXT:    movq -8(%rsp,%rdx), %rdi
 ; BURR-NEXT:    shldq %cl, %rsi, %rdi
@@ -118,6 +119,7 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    shlq %cl, %rsi
 ; BURR-NEXT:    notb %cl
 ; BURR-NEXT:    shrq %rdx
+; BURR-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; BURR-NEXT:    shrq %cl, %rdx
 ; BURR-NEXT:    orq %rsi, %rdx
 ; BURR-NEXT:    movq %rdx, 16(%rax)
@@ -126,8 +128,8 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-LABEL: test1:
 ; SRC:       # %bb.0:
 ; SRC-NEXT:    movq %rdi, %rax
-; SRC-NEXT:    addl %esi, %esi
-; SRC-NEXT:    addb $3, %sil
+; SRC-NEXT:    leal (%rsi,%rsi), %edx
+; SRC-NEXT:    addb $3, %dl
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -136,11 +138,11 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movl %esi, %edx
-; SRC-NEXT:    andb $7, %dl
-; SRC-NEXT:    shrb $3, %sil
-; SRC-NEXT:    negb %sil
-; SRC-NEXT:    movsbq %sil, %rsi
+; SRC-NEXT:    movl %edx, %ecx
+; SRC-NEXT:    shrb $3, %cl
+; SRC-NEXT:    andb $24, %cl
+; SRC-NEXT:    negb %cl
+; SRC-NEXT:    movsbq %cl, %rsi
 ; SRC-NEXT:    movq -16(%rsp,%rsi), %rdi
 ; SRC-NEXT:    movq %rdi, %r8
 ; SRC-NEXT:    movl %edx, %ecx
@@ -171,6 +173,7 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    addb $3, %dl
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shrb $3, %cl
+; LIN-NEXT:    andb $24, %cl
 ; LIN-NEXT:    negb %cl
 ; LIN-NEXT:    movsbq %cl, %rsi
 ; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -182,7 +185,6 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; LIN-NEXT:    movq -32(%rsp,%rsi), %rdi
-; LIN-NEXT:    andb $7, %dl
 ; LIN-NEXT:    movq %rdi, %r8
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shlq %cl, %r8
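
Most of the remaining churn comes from the old shrl/notb/shll/orl word-combining sequences collapsing into single shrdl/shldl double shifts once the loads are unit-aligned. As a reference for reading those checks, a hedged sketch of what one "shrdl %cl, Hi, Lo" computes (shldl is the mirrored left-shift form); the guard against N == 0 only avoids a shift count that is undefined in C++, since the hardware masks the count itself:

    #include <cstdint>

    // Sketch only: the 32-bit funnel shift performed by shrdl. The low
    // word is shifted right by N and the vacated high bits are filled
    // from the adjacent (higher) word loaded from the stack slot.
    uint32_t shrd32(uint32_t Lo, uint32_t Hi, uint32_t N) {
      N &= 31;  // x86 masks the cl count to 5 bits
      return N ? (Lo >> N) | (Hi << (32 - N)) : Lo;
    }
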
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2..ed1ba5c59e500 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -16,42 +16,36 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    subl $32, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, (%esp)
+; i686-NEXT:    movl %edx, (%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    movzbl %cl, %ebp
-; i686-NEXT:    movl 4(%esp,%ebp), %edx
-; i686-NEXT:    movl %edx, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 8(%esp,%ebp), %ebx
-; i686-NEXT:    leal (%ebx,%ebx), %edi
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %esi, %edi
-; i686-NEXT:    movl (%esp,%ebp), %esi
-; i686-NEXT:    movl 12(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %ebx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebp, 12(%eax)
-; i686-NEXT:    movl %ebx, 8(%eax)
-; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    movl %ecx, %edx
+; i686-NEXT:    shrb $3, %dl
+; i686-NEXT:    andb $12, %dl
+; i686-NEXT:    movzbl %dl, %ebx
+; i686-NEXT:    movl 8(%esp,%ebx), %esi
+; i686-NEXT:    movl (%esp,%ebx), %edx
+; i686-NEXT:    movl 4(%esp,%ebx), %ebp
+; i686-NEXT:    movl %ebp, %edi
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl 12(%esp,%ebx), %ebx
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    shrdl %cl, %ebp, %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %ebx
+; i686-NEXT:    movl %ebx, 12(%eax)
+; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    movl %edx, (%eax)
 ; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -87,43 +81,37 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    subl $32, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, (%esp)
+; i686-NEXT:    movl %edx, (%esp)
 ; i686-NEXT:    sarl $31, %ebx
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    movzbl %cl, %ebp
-; i686-NEXT:    movl 4(%esp,%ebp), %edx
-; i686-NEXT:    movl %edx, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 8(%esp,%ebp), %ebx
-; i686-NEXT:    leal (%ebx,%ebx), %edi
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %esi, %edi
-; i686-NEXT:    movl (%esp,%ebp), %esi
-; i686-NEXT:    movl 12(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %ebx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    sarl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebp, 12(%eax)
-; i686-NEXT:    movl %ebx, 8(%eax)
-; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    movl %ecx, %edx
+; i686-NEXT:    shrb $3, %dl
+; i686-NEXT:    andb $12, %dl
+; i686-NEXT:    movzbl %dl, %ebx
+; i686-NEXT:    movl 8(%esp,%ebx), %esi
+; i686-NEXT:    movl (%esp,%ebx), %edx
+; i686-NEXT:    movl 4(%esp,%ebx), %ebp
+; i686-NEXT:    movl %ebp, %edi
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl 12(%esp,%ebx), %ebx
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    shrdl %cl, %ebp, %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    sarl %cl, %ebx
+; i686-NEXT:    movl %ebx, 12(%eax)
+; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    movl %edx, (%eax)
 ; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -163,44 +151,35 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, (%esp)
-; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    negb %cl
-; i686-NEXT:    movsbl %cl, %ebp
-; i686-NEXT:    movl 24(%esp,%ebp), %ebx
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 20(%esp,%ebp), %edi
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    shrl %esi
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    orl %edx, %esi
-; i686-NEXT:    movl 16(%esp,%ebp), %edx
-; i686-NEXT:    movl 28(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shldl %cl, %ebx, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ebp, 12(%ecx)
-; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    shldl %cl, %edx, %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    movl %ebx, (%eax)
-; i686-NEXT:    movl %esi, 8(%eax)
+; i686-NEXT:    movl %ecx, %edx
+; i686-NEXT:    shrb $3, %dl
+; i686-NEXT:    andb $12, %dl
+; i686-NEXT:    negb %dl
+; i686-NEXT:    movsbl %dl, %edi
+; i686-NEXT:    movl 16(%esp,%edi), %edx
+; i686-NEXT:    movl 20(%esp,%edi), %esi
+; i686-NEXT:    movl 24(%esp,%edi), %ebx
+; i686-NEXT:    movl %ebx, %ebp
+; i686-NEXT:    shldl %cl, %esi, %ebp
+; i686-NEXT:    movl 28(%esp,%edi), %edi
+; i686-NEXT:    shldl %cl, %ebx, %edi
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl %ebp, 8(%eax)
+; i686-NEXT:    movl %edx, %edi
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 4(%eax)
+; i686-NEXT:    movl %edi, (%eax)
 ; i686-NEXT:    addl $32, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -267,13 +246,13 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    subl $92, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
@@ -282,86 +261,70 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:    andl $7, %ecx
+; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 36(%esp,%edx), %edi
+; i686-NEXT:    movl 28(%esp,%edx), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $15, %esi
-; i686-NEXT:    movl 40(%esp,%esi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 44(%esp,%esi), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 36(%esp,%esi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 32(%esp,%edx), %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 40(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edx, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $15, %ebx
-; i686-NEXT:    movl 72(%esp,%ebx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    movl %esi, %edx
+; i686-NEXT:    andl $31, %edx
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $12, %esi
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    movl 64(%esp,%esi), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 76(%esp,%ebx), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    leal (%eax,%eax), %edi
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %ebp, %edi
-; i686-NEXT:    movl 48(%esp,%esi), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebp, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 60(%esp,%esi), %edi
+; i686-NEXT:    movl 72(%esp,%esi), %esi
+; i686-NEXT:    shrdl %cl, %esi, %ebp
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl 68(%esp,%ebx), %ecx
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl 80(%esp,%ebx), %esi
-; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
 ; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    shrdl %cl, %eax, %edi
 ; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %esi, 28(%ecx)
-; i686-NEXT:    movl %ebx, 24(%ecx)
-; i686-NEXT:    movl (%esp), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 16(%ecx)
-; i686-NEXT:    movl %ebp, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %edi, 20(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    addl $100, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %esi, 28(%eax)
+; i686-NEXT:    movl %ebp, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -406,102 +369,85 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    sarl $31, %eax
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    andl $7, %ebx
-; i686-NEXT:    shrl $3, %edi
-; i686-NEXT:    andl $15, %edi
-; i686-NEXT:    movl 32(%esp,%edi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 36(%esp,%edi), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebp, %eax
-; i686-NEXT:    movl %ebp, %edx
-; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %eax
-; i686-NEXT:    andl $15, %eax
-; i686-NEXT:    movl 64(%esp,%eax), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 68(%esp,%eax), %esi
-; i686-NEXT:    leal (%esi,%esi), %eax
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    orl %ebp, %eax
-; i686-NEXT:    movl 28(%esp,%edi), %ecx
+; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 36(%esp,%edx), %edi
+; i686-NEXT:    movl 28(%esp,%edx), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 40(%esp,%edi), %edi
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl 60(%esp,%ecx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 72(%esp,%ecx), %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl 32(%esp,%edx), %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 40(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edx, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %esi, %edx
+; i686-NEXT:    andl $31, %edx
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $12, %esi
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    movl 64(%esp,%esi), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebp, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 60(%esp,%esi), %edi
+; i686-NEXT:    movl 72(%esp,%esi), %esi
+; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl %eax, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
-; i686-NEXT:    sarl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ebp, 28(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 24(%ecx)
-; i686-NEXT:    movl %ebx, 16(%ecx)
-; i686-NEXT:    movl %edi, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %eax, 20(%ecx)
+; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    sarl %cl, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    shrdl %cl, %eax, %edi
+; i686-NEXT:    sarl %cl, %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %esi, 28(%eax)
+; i686-NEXT:    movl %ebp, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
 ; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -550,107 +496,97 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:    shrl $3, %ebp
-; i686-NEXT:    andl $15, %ebp
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    subl %ebp, %eax
+; i686-NEXT:    movl %ecx, %ebx
+; i686-NEXT:    shrl $3, %ebx
+; i686-NEXT:    andl $12, %ebx
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    subl %ebx, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%eax), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    andl $7, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    movl 4(%eax), %esi
+; i686-NEXT:    movl (%edx), %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %esi
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    orl %edx, %esi
+; i686-NEXT:    movl 4(%edx), %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl (%eax), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $15, %edx
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    subl %edx, %esi
+; i686-NEXT:    movl 8(%edx), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    andl $31, %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ebp
+; i686-NEXT:    shrl $3, %ebp
+; i686-NEXT:    andl $12, %ebp
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    subl %ebp, %ecx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    andl $7, %ebx
-; i686-NEXT:    movl 8(%esi), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl 4(%esi), %eax
+; i686-NEXT:    movl (%ecx), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 4(%ecx), %edi
+; i686-NEXT:    movl 8(%ecx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shldl %cl, %edi, %esi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    shll %cl, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
+; i686-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    negl %ebx
+; i686-NEXT:    movl 64(%esp,%ebx), %ebx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    orl %edi, %eax
-; i686-NEXT:    movl (%esi), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shldl %cl, %eax, %ebx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, %edx
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %eax, %edi
 ; i686-NEXT:    negl %ebp
-; i686-NEXT:    movl 64(%esp,%ebp), %esi
+; i686-NEXT:    movl 96(%esp,%ebp), %ebp
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    movl (%esp), %edi # 4-byte Reload
-; i686-NEXT:    shldl %cl, %edi, %esi
-; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shldl %cl, %esi, %ebp
-; i686-NEXT:    negl %edx
-; i686-NEXT:    movl 96(%esp,%edx), %edx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shldl %cl, %ebx, %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %edx, 28(%ecx)
-; i686-NEXT:    movl %ebp, 20(%ecx)
-; i686-NEXT:    movl %edi, 16(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 4(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %eax, 24(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    shldl %cl, %eax, %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %ebp, 28(%eax)
+; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    movl %edi, 20(%eax)
+; i686-NEXT:    movl %edx, 16(%eax)
+; i686-NEXT:    movl %ebx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
 ; i686-NEXT:    addl $100, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf4225..bf159acc43f91 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -11,7 +11,7 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $92, %esp
+; CHECK-NEXT:    subl $80, %esp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -39,67 +39,43 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    andb $7, %al
-; CHECK-NEXT:    shrb $3, %cl
-; CHECK-NEXT:    movzbl %cl, %ebp
-; CHECK-NEXT:    movl 32(%esp,%ebp), %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %esi
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    notb %dl
-; CHECK-NEXT:    movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    leal (%ecx,%ecx), %edi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %edi
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 40(%esp,%ebp), %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %esi
-; CHECK-NEXT:    movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT:    leal (%ecx,%ecx), %edi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %edi
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %ebx
-; CHECK-NEXT:    movl 52(%esp,%ebp), %edi
-; CHECK-NEXT:    leal (%edi,%edi), %esi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    orl %ebx, %esi
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT:    movl 28(%esp,%ebp), %edx
-; CHECK-NEXT:    movl 56(%esp,%ebp), %ebx
-; CHECK-NEXT:    shrdl %cl, %ebx, %edi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %ebp, %edx
-; CHECK-NEXT:    sarl %cl, %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %ebx, 28(%eax)
-; CHECK-NEXT:    movl %edi, 24(%eax)
-; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 16(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    movl %edx, (%eax)
-; CHECK-NEXT:    movl %esi, 20(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 12(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    shrb $5, %al
+; CHECK-NEXT:    movzbl %al, %ebp
+; CHECK-NEXT:    movl 24(%esp,%ebp,4), %eax
+; CHECK-NEXT:    movl 20(%esp,%ebp,4), %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrdl %cl, %eax, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 28(%esp,%ebp,4), %edx
+; CHECK-NEXT:    shrdl %cl, %edx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; CHECK-NEXT:    shrdl %cl, %ebx, %edx
+; CHECK-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; CHECK-NEXT:    movl 36(%esp,%ebp,4), %edx
+; CHECK-NEXT:    shrdl %cl, %edx, %ebx
+; CHECK-NEXT:    movl 40(%esp,%ebp,4), %eax
+; CHECK-NEXT:    shrdl %cl, %eax, %edx
+; CHECK-NEXT:    movl 16(%esp,%ebp,4), %esi
+; CHECK-NEXT:    movl 44(%esp,%ebp,4), %ebp
+; CHECK-NEXT:    shrdl %cl, %ebp, %eax
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    shrdl %cl, %edi, %esi
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    sarl %cl, %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ebp, 28(%ecx)
+; CHECK-NEXT:    movl %eax, 24(%ecx)
+; CHECK-NEXT:    movl %edx, 20(%ecx)
+; CHECK-NEXT:    movl %ebx, 16(%ecx)
+; CHECK-NEXT:    movl (%esp), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 12(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 8(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 4(%ecx)
+; CHECK-NEXT:    movl %esi, (%ecx)
+; CHECK-NEXT:    addl $80, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -120,42 +96,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movb %r8b, %dl
-; CHECK-X64-O0-NEXT:    movb %dl, %cl
-; CHECK-X64-O0-NEXT:    andb $7, %cl
+; CHECK-X64-O0-NEXT:    movb %r8b, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT:    shrb $3, %dl
+; CHECK-X64-O0-NEXT:    movb %cl, %dl
+; CHECK-X64-O0-NEXT:    shrb $6, %dl
 ; CHECK-X64-O0-NEXT:    movzbl %dl, %edx
 ; CHECK-X64-O0-NEXT:    movl %edx, %edi
-; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi), %rdx
-; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi), %r8
-; CHECK-X64-O0-NEXT:    movq %r8, %r9
-; CHECK-X64-O0-NEXT:    shrq %cl, %r9
+; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT:    movq %r9, %rdx
+; CHECK-X64-O0-NEXT:    shrdq %cl, %rsi, %rdx
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    notb %cl
-; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi), %rsi
-; CHECK-X64-O0-NEXT:    movq %rsi, %r10
-; CHECK-X64-O0-NEXT:    addq %r10, %r10
-; CHECK-X64-O0-NEXT:    shlq %cl, %r10
-; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    orq %r10, %r9
-; CHECK-X64-O0-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT:    movq -40(%rsp,%rdi), %rdi
+; CHECK-X64-O0-NEXT:    movq -40(%rsp,%rdi,8), %rdi
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rdi, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    shrdq %cl, %r8, %rdx
+; CHECK-X64-O0-NEXT:    shrdq %cl, %r9, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-X64-O0-NEXT:    sarq %cl, %rdi
 ; CHECK-X64-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-X64-O0-NEXT:    movq %rdi, 24(%rax)
 ; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rax)
-; CHECK-X64-O0-NEXT:    movq %rdx, (%rax)
-; CHECK-X64-O0-NEXT:    movq %rcx, 8(%rax)
+; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rax)
+; CHECK-X64-O0-NEXT:    movq %rcx, (%rax)
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift1:
 ; CHECK-X64-O2:       # %bb.0: # %entry
-; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
@@ -165,29 +134,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movl %r8d, %eax
-; CHECK-X64-O2-NEXT:    andb $7, %al
-; CHECK-X64-O2-NEXT:    shrb $3, %r8b
-; CHECK-X64-O2-NEXT:    movzbl %r8b, %edx
-; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT:    movq %rdi, %r8
-; CHECK-X64-O2-NEXT:    movl %eax, %ecx
-; CHECK-X64-O2-NEXT:    shrq %cl, %r8
-; CHECK-X64-O2-NEXT:    notb %cl
-; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx), %r10
-; CHECK-X64-O2-NEXT:    leaq (%r10,%r10), %r11
-; CHECK-X64-O2-NEXT:    shlq %cl, %r11
-; CHECK-X64-O2-NEXT:    orq %r8, %r11
-; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT:    movl %eax, %ecx
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %r10
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
+; CHECK-X64-O2-NEXT:    shrb $6, %cl
+; CHECK-X64-O2-NEXT:    movzbl %cl, %edx
+; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT:    movq %r9, %r10
+; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rsi, %r10
+; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %rsi
+; CHECK-X64-O2-NEXT:    shrdq %cl, %r9, %rdi
 ; CHECK-X64-O2-NEXT:    sarq %cl, %rdx
-; CHECK-X64-O2-NEXT:    movq %rdx, 24(%r9)
-; CHECK-X64-O2-NEXT:    movq %r10, 16(%r9)
-; CHECK-X64-O2-NEXT:    movq %rsi, (%r9)
-; CHECK-X64-O2-NEXT:    movq %r11, 8(%r9)
+; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %rsi, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 entry:
 	%0 = ashr i256 %x, %a
@@ -202,8 +165,8 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $92, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    subl $80, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +183,52 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movb %al, %ch
-; CHECK-NEXT:    andb $7, %ch
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    andb $28, %al
 ; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    movsbl %al, %eax
-; CHECK-NEXT:    movl 68(%esp,%eax), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %edx
-; CHECK-NEXT:    notb %cl
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movl 64(%esp,%eax), %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrl %ebp
-; CHECK-NEXT:    shrl %cl, %ebp
-; CHECK-NEXT:    orl %edx, %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 76(%esp,%eax), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %edx
-; CHECK-NEXT:    movl 72(%esp,%eax), %ebx
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrl %ebx
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT:    shrl %cl, %ebx
-; CHECK-NEXT:    orl %edx, %ebx
-; CHECK-NEXT:    movl 84(%esp,%eax), %esi
+; CHECK-NEXT:    movl 52(%esp,%eax), %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    movl 80(%esp,%eax), %edi
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shrl %edx
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT:    shrl %cl, %edx
-; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    movl 56(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    shldl %cl, %esi, %edi
-; CHECK-NEXT:    movl 60(%esp,%eax), %ebp
-; CHECK-NEXT:    movl 88(%esp,%eax), %esi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %eax, %esi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 60(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shldl %cl, %edx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 64(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    shldl %cl, %esi, %ebp
+; CHECK-NEXT:    movl 68(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    shldl %cl, %edx, %ebx
+; CHECK-NEXT:    movl 72(%esp,%eax), %edi
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    shldl %cl, %esi, %edx
+; CHECK-NEXT:    movl 48(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; CHECK-NEXT:    movl 76(%esp,%eax), %esi
+; CHECK-NEXT:    shldl %cl, %edi, %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %esi, 28(%eax)
-; CHECK-NEXT:    movl %edi, 20(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT:    movl %esi, 12(%eax)
-; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %ebp, %edi
-; CHECK-NEXT:    movl %edi, 4(%eax)
-; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movl %edx, 24(%eax)
-; CHECK-NEXT:    movl %ebx, 16(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    movl %ebx, 20(%eax)
+; CHECK-NEXT:    movl %ebp, 16(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, 12(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, 8(%eax)
+; CHECK-NEXT:    movl (%esp), %edi # 4-byte Reload
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %edi, %esi
+; CHECK-NEXT:    movl %esi, 4(%eax)
+; CHECK-NEXT:    movl %edx, (%eax)
+; CHECK-NEXT:    addl $80, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -299,42 +246,37 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movb %sil, %dl
-; CHECK-X64-O0-NEXT:    movb %dl, %cl
-; CHECK-X64-O0-NEXT:    andb $7, %cl
+; CHECK-X64-O0-NEXT:    movb %sil, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-X64-O0-NEXT:    movb %cl, %dl
 ; CHECK-X64-O0-NEXT:    shrb $3, %dl
+; CHECK-X64-O0-NEXT:    andb $24, %dl
 ; CHECK-X64-O0-NEXT:    negb %dl
-; CHECK-X64-O0-NEXT:    movsbq %dl, %rdx
-; CHECK-X64-O0-NEXT:    movq -16(%rsp,%rdx), %rsi
-; CHECK-X64-O0-NEXT:    movq %rsi, %r10
-; CHECK-X64-O0-NEXT:    shlq %cl, %r10
+; CHECK-X64-O0-NEXT:    movsbq %dl, %r8
+; CHECK-X64-O0-NEXT:    movq -32(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT:    movq -24(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT:    movq -16(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT:    movq %r10, %rsi
+; CHECK-X64-O0-NEXT:    shldq %cl, %rdx, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    notb %cl
-; CHECK-X64-O0-NEXT:    movq -32(%rsp,%rdx), %r9
-; CHECK-X64-O0-NEXT:    movq -24(%rsp,%rdx), %r8
-; CHECK-X64-O0-NEXT:    movq %r8, %r11
-; CHECK-X64-O0-NEXT:    shrq %r11
-; CHECK-X64-O0-NEXT:    shrq %cl, %r11
+; CHECK-X64-O0-NEXT:    movq -8(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT:    shldq %cl, %r10, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    orq %r11, %r10
-; CHECK-X64-O0-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT:    movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O0-NEXT:    shldq %cl, %rsi, %rdx
-; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq %r9, %rsi
-; CHECK-X64-O0-NEXT:    shlq %cl, %rsi
+; CHECK-X64-O0-NEXT:    movq %r9, %r10
+; CHECK-X64-O0-NEXT:    shlq %cl, %r10
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT:    shldq %cl, %r9, %rdx
 ; CHECK-X64-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-X64-O0-NEXT:    movq %r8, 8(%rdi)
-; CHECK-X64-O0-NEXT:    movq %rsi, (%rdi)
-; CHECK-X64-O0-NEXT:    movq %rdx, 24(%rdi)
-; CHECK-X64-O0-NEXT:    movq %rcx, 16(%rdi)
+; CHECK-X64-O0-NEXT:    movq %r8, 24(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rcx, (%rdi)
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift2:
 ; CHECK-X64-O2:       # %bb.0:
+; CHECK-X64-O2-NEXT:    movq %rsi, %rcx
 ; CHECK-X64-O2-NEXT:    movq %rdi, %rax
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -344,32 +286,26 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movl %esi, %edx
-; CHECK-X64-O2-NEXT:    andb $7, %dl
-; CHECK-X64-O2-NEXT:    shrb $3, %sil
-; CHECK-X64-O2-NEXT:    negb %sil
-; CHECK-X64-O2-NEXT:    movsbq %sil, %rsi
-; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rsi), %rdi
-; CHECK-X64-O2-NEXT:    movq %rdi, %r8
-; CHECK-X64-O2-NEXT:    movl %edx, %ecx
+; CHECK-X64-O2-NEXT:    movl %ecx, %edx
+; CHECK-X64-O2-NEXT:    shrb $3, %dl
+; CHECK-X64-O2-NEXT:    andb $24, %dl
+; CHECK-X64-O2-NEXT:    negb %dl
+; CHECK-X64-O2-NEXT:    movsbq %dl, %rdx
+; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT:    movq %r8, %r9
+; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r9
+; CHECK-X64-O2-NEXT:    movq -8(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT:    shldq %cl, %r8, %rdx
+; CHECK-X64-O2-NEXT:    movq %rsi, %r8
 ; CHECK-X64-O2-NEXT:    shlq %cl, %r8
-; CHECK-X64-O2-NEXT:    notb %cl
-; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rsi), %r9
-; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rsi), %r10
-; CHECK-X64-O2-NEXT:    movq %r10, %r11
-; CHECK-X64-O2-NEXT:    shrq %r11
-; CHECK-X64-O2-NEXT:    shrq %cl, %r11
-; CHECK-X64-O2-NEXT:    orq %r8, %r11
-; CHECK-X64-O2-NEXT:    movq -8(%rsp,%rsi), %rsi
-; CHECK-X64-O2-NEXT:    movl %edx, %ecx
-; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %rsi
-; CHECK-X64-O2-NEXT:    movq %r9, %rdi
-; CHECK-X64-O2-NEXT:    shlq %cl, %rdi
-; CHECK-X64-O2-NEXT:    shldq %cl, %r9, %r10
-; CHECK-X64-O2-NEXT:    movq %rsi, 24(%rax)
-; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
-; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
-; CHECK-X64-O2-NEXT:    movq %r11, 16(%rax)
+; CHECK-X64-O2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-X64-O2-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdi, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %r8, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 {
   %b = shl i256 1, %c  ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc8797..5c9c81758d633 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,61 +588,60 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ah
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ah, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -658,44 +657,35 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb (%eax), %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
@@ -718,43 +708,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %bl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -771,43 +761,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -899,66 +884,64 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -967,58 +950,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -1033,28 +1005,28 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %al, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
@@ -1081,57 +1053,47 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1218,62 +1180,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1289,45 +1250,36 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
@@ -1349,45 +1301,45 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1404,44 +1356,39 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -1459,36 +1406,36 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -1496,142 +1443,127 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1640,17 +1572,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
@@ -1662,7 +1594,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1672,95 +1604,94 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1773,7 +1704,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
@@ -1806,67 +1737,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1880,31 +1788,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1913,68 +1819,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1988,32 +1893,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2022,58 +1925,46 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2089,31 +1980,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    negb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r10), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r10), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -2146,79 +2038,72 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %sil, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %rax
@@ -2226,50 +2111,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2278,7 +2154,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $84, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
@@ -2288,7 +2164,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
@@ -2310,79 +2186,78 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2398,7 +2273,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $84, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2411,7 +2286,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
@@ -2444,69 +2319,45 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2520,31 +2371,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2553,66 +2402,72 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edx), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 84(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 84(%esp,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
@@ -2631,32 +2486,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2665,61 +2518,48 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2735,36 +2575,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
@@ -2773,145 +2613,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rsi, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -2920,17 +2745,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
@@ -2942,7 +2767,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -2953,95 +2778,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3054,7 +2878,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
@@ -3088,67 +2912,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3162,104 +2963,99 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -3273,93 +3069,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3407,9 +3189,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
@@ -3417,7 +3199,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
@@ -3426,7 +3207,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -3488,22 +3269,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -3511,8 +3289,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, (%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -3520,64 +3296,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3606,7 +3360,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -3616,42 +3370,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
@@ -3662,10 +3415,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -3676,11 +3429,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
@@ -3699,8 +3449,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -3709,51 +3457,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3762,42 +3499,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,8 +3543,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3816,6 +3551,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -3834,196 +3571,195 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4036,62 +3772,62 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -4108,137 +3844,92 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4251,42 +3942,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,8 +4000,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -4326,146 +4019,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 56(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $184, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4478,7 +4163,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4489,7 +4174,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
@@ -4499,7 +4184,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
@@ -4514,7 +4199,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -4524,7 +4209,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -4534,8 +4219,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -4552,120 +4235,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4680,7 +4333,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
@@ -4712,100 +4364,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%rbx), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4815,7 +4458,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
@@ -4831,69 +4474,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r10), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r10), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r10), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r10), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r10), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r10), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%r10), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r12, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -4922,7 +4537,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -4932,57 +4547,54 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r13b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
@@ -4994,11 +4606,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
@@ -5017,8 +4625,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -5027,56 +4633,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %r12, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r11, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5086,41 +4676,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,13 +4727,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
@@ -5163,174 +4755,165 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, (%esp) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5366,67 +4949,67 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %edx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -5443,136 +5026,87 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 160(%esp,%edx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 60(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 52(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5585,45 +5119,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5641,13 +5175,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -5664,156 +5198,139 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 212(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $192, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5826,41 +5343,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ebx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5882,13 +5399,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -5905,126 +5421,88 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 164(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $180, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -6072,9 +5550,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
@@ -6082,7 +5560,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
@@ -6091,7 +5568,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -6153,22 +5630,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -6176,74 +5650,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6281,44 +5731,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
@@ -6329,10 +5778,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -6343,11 +5792,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
@@ -6376,52 +5822,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6430,12 +5863,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
@@ -6443,7 +5876,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
@@ -6452,19 +5885,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +5906,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6482,7 +5915,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -6503,196 +5936,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6705,7 +6137,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +6150,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
@@ -6726,189 +6158,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -6921,7 +6308,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $188, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6942,201 +6329,201 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 60(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -7149,7 +6536,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,7 +6545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
@@ -7170,173 +6557,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx

>From 306ebacce1d4bf026c47a9adf65981f9a1e15434 Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Tue, 25 Jun 2024 15:02:18 +0200
Subject: [PATCH 3/4] Addressing review comments

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cd40df473c67c..015d3b412715b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4542,8 +4542,10 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
     unsigned IsFast = 0;
     const bool AllowsFastMisalignedMemoryAccesses =
         TLI.allowsMisalignedMemoryAccesses(
-            LoadStoreVT, /*AddrSpace*/ 0, /*Alignment*/ Align(1),
-            /*Flags*/ MachineMemOperand::MONone, &IsFast) &&
+            LoadStoreVT, /*AddrSpace=*/DAG.getDataLayout().getAllocaAddrSpace(),
+            /*Alignment=*/Align(LoadStoreVT.getStoreSize()),
+            /*Flags=*/MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+            &IsFast) &&
         IsFast;
     if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
       return Align(1);
@@ -4552,9 +4554,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   }();
 
   const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
-  const bool IsOneStepShift =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
-      Log2_32(ShiftUnitInBits);
+  const bool IsOneStepShift = KnownTrailingZeros >= Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.

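For reviewers who want to check the condition in isolation, here is a minimal standalone sketch of the one-step test the hunk above computes (plain C++, no SelectionDAG types; the helper name isOneStepShift and the parameter KnownTrailingZeros are made up for illustration, the latter standing in for DAG.computeKnownBits(ShAmt).countMinTrailingZeros()):

#include <cassert>

// Illustrative-only helper: the shift can be folded entirely into the
// load address when the shift amount is provably a multiple of the
// load/store unit, i.e. its log2(unit) low bits are known zero.
static bool isOneStepShift(unsigned KnownTrailingZeros,
                           unsigned ShiftUnitInBits) {
  assert(ShiftUnitInBits != 0 &&
         (ShiftUnitInBits & (ShiftUnitInBits - 1)) == 0 &&
         "shifting unit must be a power of two");
  unsigned Log2Unit = 0;
  while ((1u << Log2Unit) < ShiftUnitInBits)
    ++Log2Unit;
  return KnownTrailingZeros >= Log2Unit;
}

int main() {
  // With a 32-bit unit, a one-step shift needs the low 5 bits known zero.
  return isOneStepShift(5, 32) && !isOneStepShift(3, 32) ? 0 : 1;
}
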
>From 5794d2c4a61d932b38cf8453c3cf33fafdd8e37f Mon Sep 17 00:00:00 2001
From: Gergely Futo <gergely.futo at hightec-rt.com>
Date: Wed, 3 Jul 2024 08:24:41 +0200
Subject: [PATCH 4/4] Use native register width as shifting unit

---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |    40 +-
 ...lar-shift-by-byte-multiple-legalization.ll |   120 +-
 .../AArch64/wide-scalar-shift-legalization.ll |   118 +-
 llvm/test/CodeGen/Mips/llvm-ir/ashr.ll        |    64 +-
 llvm/test/CodeGen/Mips/llvm-ir/lshr.ll        |   180 +-
 llvm/test/CodeGen/Mips/llvm-ir/shl.ll         |   122 +-
 llvm/test/CodeGen/PowerPC/ctrloop-sh.ll       |    54 +-
 llvm/test/CodeGen/PowerPC/pr59074.ll          |    11 +-
 ...lar-shift-by-byte-multiple-legalization.ll |   734 +-
 .../PowerPC/wide-scalar-shift-legalization.ll |   272 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |    52 +-
 ...lar-shift-by-byte-multiple-legalization.ll |   408 +-
 .../RISCV/wide-scalar-shift-legalization.ll   |   396 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |   502 +-
 .../div-rem-pair-recomposition-unsigned.ll    |   447 +-
 llvm/test/CodeGen/X86/pr38539.ll              |   102 +-
 .../CodeGen/X86/scheduler-backtracking.ll     |   170 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |   565 +-
 llvm/test/CodeGen/X86/shift-i256.ll           |   278 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 21282 ++++++++++++++--
 .../X86/wide-scalar-shift-legalization.ll     |  4567 ++--
 ...ad-of-small-alloca-with-zero-upper-half.ll |  3575 ++-
 .../CodeGen/X86/widen-load-of-small-alloca.ll |  1675 +-
 23 files changed, 27988 insertions(+), 7746 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 015d3b412715b..a4dfb89828ebe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4535,26 +4535,12 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
     LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
   } while (!TLI.isTypeLegal(LoadStoreVT));
 
-  const unsigned KnownTrailingZeros =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros();
-
-  const Align LoadStoreAlign = [&]() -> Align {
-    unsigned IsFast = 0;
-    const bool AllowsFastMisalignedMemoryAccesses =
-        TLI.allowsMisalignedMemoryAccesses(
-            LoadStoreVT, /*AddrSpace=*/DAG.getDataLayout().getAllocaAddrSpace(),
-            /*Alignment=*/Align(LoadStoreVT.getStoreSize()),
-            /*Flags=*/MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
-            &IsFast) &&
-        IsFast;
-    if (AllowsFastMisalignedMemoryAccesses && KnownTrailingZeros >= 3)
-      return Align(1);
-
-    return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
-  }();
-
-  const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
-  const bool IsOneStepShift = KnownTrailingZeros >= Log2_32(ShiftUnitInBits);
+  const unsigned ShiftUnitInBits = LoadStoreVT.getStoreSize() * 8;
+  assert(isPowerOf2_32(ShiftUnitInBits) &&
+         "Shifting unit is not a power of two!");
+  const bool IsOneStepShift =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+      Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.
@@ -4572,9 +4558,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 
   // Get a temporary stack slot 2x the width of our VT.
   // FIXME: reuse stack slots?
-  Align StackSlotAlignment(LoadStoreAlign);
-  SDValue StackPtr = DAG.CreateStackTemporary(
-      TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+  SDValue StackPtr = DAG.CreateStackTemporary(StackSlotVT);
   EVT PtrTy = StackPtr.getValueType();
   SDValue Ch = DAG.getEntryNode();
 
@@ -4594,7 +4578,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
     Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
   }
   // And spill it into the stack slot.
-  Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+  Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo);
 
   // Now, compute the full-byte offset into stack slot from where we can load.
   // We have shift amount, which is in bits. Offset should point to an aligned
@@ -4640,11 +4624,9 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
 
   // And load it! While the load is not legal, legalizing it is obvious.
-  SDValue Res =
-      DAG.getLoad(VT, dl, Ch, AdjStackPtr,
-                  MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
-                  LoadStoreAlign);
-  // We've performed the shift by a CHAR_BIT * [ShAmt / LoadAlign]
+  SDValue Res = DAG.getLoad(
+      VT, dl, Ch, AdjStackPtr,
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
 
   // If we may still have a remaining bits to shift by, do so now.
   if (!IsOneStepShift) {
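To make the new address arithmetic concrete, here is a small worked example (values and variable names are illustrative, not PR code) that decomposes a bit shift amount the same way the updated x86 checks do, with "shrl $3; andl $60" producing the aligned byte offset and "andl $31" the residual in-register shift:

#include <cstdio>

int main() {
  // Example: a wide shiftee on a 32-bit target, shifting by 100 bits
  // (assumed already reduced below the shiftee's bit width).
  const unsigned ShiftUnitInBits = 32; // native register width
  const unsigned ShAmtInBits = 100;
  // Aligned byte offset into the stack slot: divide by CHAR_BIT, then
  // round down to a multiple of the 4-byte unit ("shrl $3; andl $60").
  unsigned ByteOffset = (ShAmtInBits / 8) & ~(ShiftUnitInBits / 8 - 1);
  // Residual in-register shift ("andl $31").
  unsigned BitShift = ShAmtInBits % ShiftUnitInBits;
  // 100 bits = 12 aligned bytes (96 bits) + a 4-bit residual shift.
  std::printf("offset = %u bytes, residual = %u bits\n", ByteOffset, BitShift);
  return 0;
}
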
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index e21015ad3db30..4f46f7731e257 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -179,21 +179,36 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x9, x8, [x0, #16]
-; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    ldp x8, x9, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
-; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    movi v1.2d, #0000000000000000
+; ALL-NEXT:    str q0, [sp, #-64]!
+; ALL-NEXT:    stp x8, x9, [sp, #16]
 ; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    and x9, x10, #0x1f
-; ALL-NEXT:    str q1, [sp]
+; ALL-NEXT:    and x9, x10, #0x18
+; ALL-NEXT:    stp q1, q1, [sp, #32]
 ; ALL-NEXT:    add x8, x8, x9
-; ALL-NEXT:    stp q0, q0, [sp, #32]
-; ALL-NEXT:    ldp x10, x9, [x8, #16]
-; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    str q0, [x2]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
+; ALL-NEXT:    lsl x9, x10, #3
+; ALL-NEXT:    ldp x11, x10, [x8, #16]
+; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    mvn w13, w9
+; ALL-NEXT:    and x9, x9, #0x38
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsr x11, x11, x9
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    lsr x10, x10, x9
+; ALL-NEXT:    lsr x12, x12, x9
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    orr x8, x9, x8
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x11, x10, [x2, #16]
+; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -207,21 +222,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldp x8, x9, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    stp x9, x8, [sp, #48]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    and x9, x10, #0x1f
-; ALL-NEXT:    add x8, x8, #32
-; ALL-NEXT:    stp q0, q0, [sp]
-; ALL-NEXT:    str q1, [sp, #32]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    ldp x9, x10, [x8, #16]
-; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    str q0, [x2]
-; ALL-NEXT:    stp x9, x10, [x2, #16]
+; ALL-NEXT:    mov x11, sp
+; ALL-NEXT:    add x11, x11, #32
+; ALL-NEXT:    and x12, x10, #0x18
+; ALL-NEXT:    stp x8, x9, [sp, #48]
+; ALL-NEXT:    lsl x9, x10, #3
+; ALL-NEXT:    stp q0, q1, [sp, #16]
+; ALL-NEXT:    sub x8, x11, x12
+; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    mvn w13, w9
+; ALL-NEXT:    and x9, x9, #0x38
+; ALL-NEXT:    ldp x10, x11, [x8]
+; ALL-NEXT:    ldp x12, x8, [x8, #16]
+; ALL-NEXT:    lsr x14, x10, #1
+; ALL-NEXT:    lsr x15, x11, #1
+; ALL-NEXT:    lsl x11, x11, x9
+; ALL-NEXT:    lsr x16, x12, #1
+; ALL-NEXT:    lsl x10, x10, x9
+; ALL-NEXT:    lsl x12, x12, x9
+; ALL-NEXT:    lsr x14, x14, x13
+; ALL-NEXT:    lsl x8, x8, x9
+; ALL-NEXT:    lsr x9, x16, x13
+; ALL-NEXT:    lsr x13, x15, x13
+; ALL-NEXT:    orr x11, x11, x14
+; ALL-NEXT:    orr x8, x8, x9
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x10, x11, [x2]
+; ALL-NEXT:    stp x9, x8, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -234,22 +265,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldr q0, [x0]
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
-; ALL-NEXT:    ldr q0, [x0]
-; ALL-NEXT:    and x10, x10, #0x1f
+; ALL-NEXT:    str q0, [sp, #-64]!
+; ALL-NEXT:    asr x11, x8, #63
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    asr x8, x8, #63
-; ALL-NEXT:    mov x9, sp
-; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    stp x8, x8, [sp, #48]
-; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    add x8, x9, x10
-; ALL-NEXT:    ldp x10, x9, [x8, #16]
-; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    str q0, [x2]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    and x9, x10, #0x18
+; ALL-NEXT:    stp x11, x11, [sp, #48]
+; ALL-NEXT:    add x8, x8, x9
+; ALL-NEXT:    lsl x9, x10, #3
+; ALL-NEXT:    stp x11, x11, [sp, #32]
+; ALL-NEXT:    ldp x11, x10, [x8, #16]
+; ALL-NEXT:    mvn w13, w9
+; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    and x9, x9, #0x38
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsr x11, x11, x9
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    asr x10, x10, x9
+; ALL-NEXT:    lsr x12, x12, x9
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    orr x8, x9, x8
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x11, x10, [x2, #16]
+; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index 531e0fa740da7..faf3602791bbe 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -154,39 +154,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x9, x8, [x0, #16]
-; ALL-NEXT:    movi v0.2d, #0000000000000000
-; ALL-NEXT:    ldr x10, [x1]
-; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    lsr x8, x10, #3
-; ALL-NEXT:    mov x9, sp
-; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    and x12, x10, #0x3f
-; ALL-NEXT:    and x8, x8, #0x18
-; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    ldp x8, x10, [x0, #16]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    movi v1.2d, #0000000000000000
+; ALL-NEXT:    str q0, [sp, #-64]!
+; ALL-NEXT:    lsr x11, x9, #3
+; ALL-NEXT:    stp x8, x10, [sp, #16]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    stp q1, q1, [sp, #32]
+; ALL-NEXT:    and x12, x9, #0x3f
+; ALL-NEXT:    and x10, x11, #0x18
 ; ALL-NEXT:    eor x12, x12, #0x3f
-; ALL-NEXT:    add x8, x9, x8
+; ALL-NEXT:    add x8, x8, x10
 ; ALL-NEXT:    ldp x13, x11, [x8]
-; ALL-NEXT:    ldr x9, [x8, #24]
+; ALL-NEXT:    ldr x10, [x8, #24]
 ; ALL-NEXT:    ldr x8, [x8, #16]
-; ALL-NEXT:    lsl x14, x9, #1
-; ALL-NEXT:    lsr x9, x9, x10
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsr x10, x10, x9
 ; ALL-NEXT:    lsl x15, x11, #1
-; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsr x13, x13, x10
+; ALL-NEXT:    lsr x11, x11, x9
+; ALL-NEXT:    lsr x13, x13, x9
 ; ALL-NEXT:    lsl x14, x14, x12
 ; ALL-NEXT:    lsl x12, x15, x12
 ; ALL-NEXT:    lsl x15, x8, #1
-; ALL-NEXT:    lsr x8, x8, x10
-; ALL-NEXT:    mvn w10, w10
-; ALL-NEXT:    lsl x10, x15, x10
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    mvn w9, w9
+; ALL-NEXT:    lsl x9, x15, x9
 ; ALL-NEXT:    orr x8, x14, x8
-; ALL-NEXT:    stp x8, x9, [x2, #16]
-; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    orr x8, x11, x10
-; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    stp x8, x10, [x2, #16]
+; ALL-NEXT:    orr x10, x12, x13
+; ALL-NEXT:    orr x8, x11, x9
+; ALL-NEXT:    stp x10, x8, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -199,39 +198,39 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x9, x8, [x0, #16]
-; ALL-NEXT:    movi v0.2d, #0000000000000000
-; ALL-NEXT:    ldr x10, [x1]
-; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    stp x9, x8, [sp, #48]
-; ALL-NEXT:    lsr x8, x10, #3
-; ALL-NEXT:    mov x9, sp
-; ALL-NEXT:    add x9, x9, #32
-; ALL-NEXT:    stp q0, q1, [sp, #16]
-; ALL-NEXT:    and x12, x10, #0x3f
-; ALL-NEXT:    and x8, x8, #0x18
-; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    ldp x10, x8, [x0, #16]
+; ALL-NEXT:    movi v1.2d, #0000000000000000
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    lsr x11, x9, #3
+; ALL-NEXT:    stp x10, x8, [sp, #48]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    stp q1, q0, [sp, #16]
+; ALL-NEXT:    and x12, x9, #0x3f
+; ALL-NEXT:    and x10, x11, #0x18
+; ALL-NEXT:    str q1, [sp]
 ; ALL-NEXT:    eor x12, x12, #0x3f
-; ALL-NEXT:    sub x8, x9, x8
+; ALL-NEXT:    sub x8, x8, x10
 ; ALL-NEXT:    ldp x11, x13, [x8, #16]
-; ALL-NEXT:    ldr x9, [x8]
+; ALL-NEXT:    ldr x10, [x8]
 ; ALL-NEXT:    ldr x8, [x8, #8]
-; ALL-NEXT:    lsr x15, x9, #1
-; ALL-NEXT:    lsl x9, x9, x10
+; ALL-NEXT:    lsr x15, x10, #1
+; ALL-NEXT:    lsl x10, x10, x9
 ; ALL-NEXT:    lsr x14, x11, #1
-; ALL-NEXT:    lsl x11, x11, x10
-; ALL-NEXT:    lsl x13, x13, x10
+; ALL-NEXT:    lsl x11, x11, x9
+; ALL-NEXT:    lsl x13, x13, x9
 ; ALL-NEXT:    lsr x14, x14, x12
 ; ALL-NEXT:    lsr x12, x15, x12
 ; ALL-NEXT:    lsr x15, x8, #1
-; ALL-NEXT:    lsl x8, x8, x10
-; ALL-NEXT:    mvn w10, w10
-; ALL-NEXT:    lsr x10, x15, x10
+; ALL-NEXT:    lsl x8, x8, x9
+; ALL-NEXT:    mvn w9, w9
+; ALL-NEXT:    lsr x9, x15, x9
 ; ALL-NEXT:    orr x8, x8, x12
-; ALL-NEXT:    stp x9, x8, [x2]
-; ALL-NEXT:    orr x9, x13, x14
-; ALL-NEXT:    orr x8, x11, x10
-; ALL-NEXT:    stp x8, x9, [x2, #16]
+; ALL-NEXT:    stp x10, x8, [x2]
+; ALL-NEXT:    orr x10, x13, x14
+; ALL-NEXT:    orr x8, x11, x9
+; ALL-NEXT:    stp x8, x10, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -243,21 +242,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldr q0, [x0]
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
-; ALL-NEXT:    mov x11, sp
 ; ALL-NEXT:    ldr x10, [x1]
-; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    str q0, [sp, #-64]!
+; ALL-NEXT:    asr x11, x8, #63
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    lsr x9, x10, #3
-; ALL-NEXT:    asr x8, x8, #63
-; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    lsr x8, x10, #3
+; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    and x12, x10, #0x3f
-; ALL-NEXT:    and x9, x9, #0x18
-; ALL-NEXT:    stp x8, x8, [sp, #48]
+; ALL-NEXT:    and x8, x8, #0x18
+; ALL-NEXT:    stp x11, x11, [sp, #48]
 ; ALL-NEXT:    eor x12, x12, #0x3f
-; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    add x8, x11, x9
+; ALL-NEXT:    stp x11, x11, [sp, #32]
+; ALL-NEXT:    add x8, x9, x8
 ; ALL-NEXT:    ldp x13, x11, [x8]
 ; ALL-NEXT:    ldr x9, [x8, #24]
 ; ALL-NEXT:    ldr x8, [x8, #16]
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 6db3fb930b94e..1a29b57986325 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -384,15 +384,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS-NEXT:    sra $1, $4, 31
 ; MIPS-NEXT:    sw $7, 28($sp)
-; MIPS-NEXT:    sw $6, 24($sp)
 ; MIPS-NEXT:    sw $5, 20($sp)
-; MIPS-NEXT:    sw $4, 16($sp)
 ; MIPS-NEXT:    sw $1, 12($sp)
-; MIPS-NEXT:    sw $1, 8($sp)
 ; MIPS-NEXT:    sw $1, 4($sp)
+; MIPS-NEXT:    addiu $2, $sp, 0
+; MIPS-NEXT:    sw $6, 24($sp)
+; MIPS-NEXT:    sw $4, 16($sp)
+; MIPS-NEXT:    sw $1, 8($sp)
 ; MIPS-NEXT:    sw $1, 0($sp)
-; MIPS-NEXT:    addiu $1, $sp, 0
-; MIPS-NEXT:    addiu $1, $1, 16
+; MIPS-NEXT:    addiu $1, $2, 16
 ; MIPS-NEXT:    lw $2, 60($sp)
 ; MIPS-NEXT:    srl $3, $2, 3
 ; MIPS-NEXT:    andi $3, $3, 12
@@ -425,15 +425,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32-NEXT:    sra $1, $4, 31
 ; MIPS32-NEXT:    sw $7, 28($sp)
-; MIPS32-NEXT:    sw $6, 24($sp)
 ; MIPS32-NEXT:    sw $5, 20($sp)
-; MIPS32-NEXT:    sw $4, 16($sp)
 ; MIPS32-NEXT:    sw $1, 12($sp)
-; MIPS32-NEXT:    sw $1, 8($sp)
 ; MIPS32-NEXT:    sw $1, 4($sp)
+; MIPS32-NEXT:    addiu $2, $sp, 0
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
+; MIPS32-NEXT:    sw $1, 8($sp)
 ; MIPS32-NEXT:    sw $1, 0($sp)
-; MIPS32-NEXT:    addiu $1, $sp, 0
-; MIPS32-NEXT:    addiu $1, $1, 16
+; MIPS32-NEXT:    addiu $1, $2, 16
 ; MIPS32-NEXT:    lw $2, 60($sp)
 ; MIPS32-NEXT:    srl $3, $2, 3
 ; MIPS32-NEXT:    andi $3, $3, 12
@@ -466,15 +466,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R2-NEXT:    .cfi_def_cfa_offset 32
 ; 32R2-NEXT:    sra $1, $4, 31
 ; 32R2-NEXT:    sw $7, 28($sp)
-; 32R2-NEXT:    sw $6, 24($sp)
 ; 32R2-NEXT:    sw $5, 20($sp)
-; 32R2-NEXT:    sw $4, 16($sp)
 ; 32R2-NEXT:    sw $1, 12($sp)
-; 32R2-NEXT:    sw $1, 8($sp)
 ; 32R2-NEXT:    sw $1, 4($sp)
+; 32R2-NEXT:    addiu $2, $sp, 0
+; 32R2-NEXT:    sw $6, 24($sp)
+; 32R2-NEXT:    sw $4, 16($sp)
+; 32R2-NEXT:    sw $1, 8($sp)
 ; 32R2-NEXT:    sw $1, 0($sp)
-; 32R2-NEXT:    addiu $1, $sp, 0
-; 32R2-NEXT:    addiu $1, $1, 16
+; 32R2-NEXT:    addiu $1, $2, 16
 ; 32R2-NEXT:    lw $2, 60($sp)
 ; 32R2-NEXT:    srl $3, $2, 3
 ; 32R2-NEXT:    andi $3, $3, 12
@@ -507,15 +507,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R6-NEXT:    .cfi_def_cfa_offset 32
 ; 32R6-NEXT:    sra $1, $4, 31
 ; 32R6-NEXT:    sw $7, 28($sp)
-; 32R6-NEXT:    sw $6, 24($sp)
 ; 32R6-NEXT:    sw $5, 20($sp)
-; 32R6-NEXT:    sw $4, 16($sp)
 ; 32R6-NEXT:    sw $1, 12($sp)
-; 32R6-NEXT:    sw $1, 8($sp)
 ; 32R6-NEXT:    sw $1, 4($sp)
+; 32R6-NEXT:    addiu $2, $sp, 0
+; 32R6-NEXT:    sw $6, 24($sp)
+; 32R6-NEXT:    sw $4, 16($sp)
+; 32R6-NEXT:    sw $1, 8($sp)
 ; 32R6-NEXT:    sw $1, 0($sp)
-; 32R6-NEXT:    addiu $1, $sp, 0
-; 32R6-NEXT:    addiu $1, $1, 16
+; 32R6-NEXT:    addiu $1, $2, 16
 ; 32R6-NEXT:    lw $2, 60($sp)
 ; 32R6-NEXT:    srl $3, $2, 3
 ; 32R6-NEXT:    andi $3, $3, 12
@@ -619,13 +619,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
 ; MMR3-NEXT:    sra $1, $4, 31
-; MMR3-NEXT:    swp $6, 24($sp)
-; MMR3-NEXT:    swp $4, 16($sp)
+; MMR3-NEXT:    sw $7, 28($sp)
+; MMR3-NEXT:    sw $5, 20($sp)
 ; MMR3-NEXT:    sw $1, 12($sp)
-; MMR3-NEXT:    sw $1, 8($sp)
 ; MMR3-NEXT:    sw $1, 4($sp)
-; MMR3-NEXT:    sw $1, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
+; MMR3-NEXT:    sw $6, 24($sp)
+; MMR3-NEXT:    sw $4, 16($sp)
+; MMR3-NEXT:    sw $1, 8($sp)
+; MMR3-NEXT:    sw $1, 0($sp)
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
 ; MMR3-NEXT:    srl16 $4, $3, 3
@@ -660,15 +662,15 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
 ; MMR6-NEXT:    .cfi_offset 16, -4
 ; MMR6-NEXT:    sra $1, $4, 31
-; MMR6-NEXT:    sw $7, 32($sp)
-; MMR6-NEXT:    sw $6, 28($sp)
-; MMR6-NEXT:    sw $5, 24($sp)
-; MMR6-NEXT:    sw $4, 20($sp)
-; MMR6-NEXT:    sw $1, 16($sp)
+; MMR6-NEXT:    sw $7, 28($sp)
+; MMR6-NEXT:    sw $5, 20($sp)
 ; MMR6-NEXT:    sw $1, 12($sp)
-; MMR6-NEXT:    sw $1, 8($sp)
 ; MMR6-NEXT:    sw $1, 4($sp)
-; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    addiu $2, $sp, 0
+; MMR6-NEXT:    sw $6, 24($sp)
+; MMR6-NEXT:    sw $4, 16($sp)
+; MMR6-NEXT:    sw $1, 8($sp)
+; MMR6-NEXT:    sw $1, 0($sp)
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
 ; MMR6-NEXT:    srl16 $4, $3, 3
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index fa10293c0f6fb..7db14fc506e79 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,39 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2:       # %bb.0: # %entry
 ; MIPS2-NEXT:    addiu $sp, $sp, -32
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS2-NEXT:    addiu $1, $sp, 0
+; MIPS2-NEXT:    lw $1, 60($sp)
+; MIPS2-NEXT:    addiu $2, $sp, 0
 ; MIPS2-NEXT:    sw $7, 28($sp)
-; MIPS2-NEXT:    sw $6, 24($sp)
 ; MIPS2-NEXT:    sw $5, 20($sp)
-; MIPS2-NEXT:    sw $4, 16($sp)
-; MIPS2-NEXT:    addiu $1, $1, 16
-; MIPS2-NEXT:    lw $2, 60($sp)
-; MIPS2-NEXT:    srl $3, $2, 3
+; MIPS2-NEXT:    addiu $2, $2, 16
+; MIPS2-NEXT:    srl $3, $1, 3
 ; MIPS2-NEXT:    andi $3, $3, 12
-; MIPS2-NEXT:    subu $1, $1, $3
+; MIPS2-NEXT:    sw $6, 24($sp)
+; MIPS2-NEXT:    sw $4, 16($sp)
+; MIPS2-NEXT:    subu $2, $2, $3
 ; MIPS2-NEXT:    sw $zero, 12($sp)
-; MIPS2-NEXT:    sw $zero, 8($sp)
 ; MIPS2-NEXT:    sw $zero, 4($sp)
+; MIPS2-NEXT:    lw $3, 4($2)
+; MIPS2-NEXT:    sw $zero, 8($sp)
 ; MIPS2-NEXT:    sw $zero, 0($sp)
-; MIPS2-NEXT:    lw $3, 4($1)
-; MIPS2-NEXT:    lw $5, 8($1)
-; MIPS2-NEXT:    srlv $4, $5, $2
+; MIPS2-NEXT:    lw $5, 8($2)
+; MIPS2-NEXT:    srlv $4, $5, $1
 ; MIPS2-NEXT:    sll $6, $3, 1
-; MIPS2-NEXT:    andi $7, $2, 31
+; MIPS2-NEXT:    andi $7, $1, 31
 ; MIPS2-NEXT:    xori $7, $7, 31
 ; MIPS2-NEXT:    sllv $6, $6, $7
-; MIPS2-NEXT:    srlv $3, $3, $2
-; MIPS2-NEXT:    lw $8, 0($1)
+; MIPS2-NEXT:    srlv $3, $3, $1
+; MIPS2-NEXT:    lw $8, 0($2)
 ; MIPS2-NEXT:    sll $9, $8, 1
 ; MIPS2-NEXT:    sllv $9, $9, $7
 ; MIPS2-NEXT:    or $3, $3, $9
 ; MIPS2-NEXT:    or $4, $4, $6
-; MIPS2-NEXT:    lw $1, 12($1)
-; MIPS2-NEXT:    srlv $1, $1, $2
+; MIPS2-NEXT:    lw $2, 12($2)
+; MIPS2-NEXT:    srlv $2, $2, $1
 ; MIPS2-NEXT:    sll $5, $5, 1
 ; MIPS2-NEXT:    sllv $5, $5, $7
-; MIPS2-NEXT:    or $5, $1, $5
-; MIPS2-NEXT:    srlv $2, $8, $2
+; MIPS2-NEXT:    or $5, $2, $5
+; MIPS2-NEXT:    srlv $2, $8, $1
 ; MIPS2-NEXT:    jr $ra
 ; MIPS2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -438,39 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    addiu $1, $sp, 0
+; MIPS32-NEXT:    lw $1, 60($sp)
+; MIPS32-NEXT:    addiu $2, $sp, 0
 ; MIPS32-NEXT:    sw $7, 28($sp)
-; MIPS32-NEXT:    sw $6, 24($sp)
 ; MIPS32-NEXT:    sw $5, 20($sp)
-; MIPS32-NEXT:    sw $4, 16($sp)
-; MIPS32-NEXT:    addiu $1, $1, 16
-; MIPS32-NEXT:    lw $2, 60($sp)
-; MIPS32-NEXT:    srl $3, $2, 3
+; MIPS32-NEXT:    addiu $2, $2, 16
+; MIPS32-NEXT:    srl $3, $1, 3
 ; MIPS32-NEXT:    andi $3, $3, 12
-; MIPS32-NEXT:    subu $1, $1, $3
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
+; MIPS32-NEXT:    subu $2, $2, $3
 ; MIPS32-NEXT:    sw $zero, 12($sp)
-; MIPS32-NEXT:    sw $zero, 8($sp)
 ; MIPS32-NEXT:    sw $zero, 4($sp)
+; MIPS32-NEXT:    lw $3, 4($2)
+; MIPS32-NEXT:    sw $zero, 8($sp)
 ; MIPS32-NEXT:    sw $zero, 0($sp)
-; MIPS32-NEXT:    lw $3, 4($1)
-; MIPS32-NEXT:    lw $5, 8($1)
-; MIPS32-NEXT:    srlv $4, $5, $2
+; MIPS32-NEXT:    lw $5, 8($2)
+; MIPS32-NEXT:    srlv $4, $5, $1
 ; MIPS32-NEXT:    sll $6, $3, 1
-; MIPS32-NEXT:    andi $7, $2, 31
+; MIPS32-NEXT:    andi $7, $1, 31
 ; MIPS32-NEXT:    xori $7, $7, 31
 ; MIPS32-NEXT:    sllv $6, $6, $7
-; MIPS32-NEXT:    srlv $3, $3, $2
-; MIPS32-NEXT:    lw $8, 0($1)
+; MIPS32-NEXT:    srlv $3, $3, $1
+; MIPS32-NEXT:    lw $8, 0($2)
 ; MIPS32-NEXT:    sll $9, $8, 1
 ; MIPS32-NEXT:    sllv $9, $9, $7
 ; MIPS32-NEXT:    or $3, $3, $9
 ; MIPS32-NEXT:    or $4, $4, $6
-; MIPS32-NEXT:    lw $1, 12($1)
-; MIPS32-NEXT:    srlv $1, $1, $2
+; MIPS32-NEXT:    lw $2, 12($2)
+; MIPS32-NEXT:    srlv $2, $2, $1
 ; MIPS32-NEXT:    sll $5, $5, 1
 ; MIPS32-NEXT:    sllv $5, $5, $7
-; MIPS32-NEXT:    or $5, $1, $5
-; MIPS32-NEXT:    srlv $2, $8, $2
+; MIPS32-NEXT:    or $5, $2, $5
+; MIPS32-NEXT:    srlv $2, $8, $1
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
@@ -478,39 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT:    addiu $1, $sp, 0
+; MIPS32R2-NEXT:    lw $1, 60($sp)
+; MIPS32R2-NEXT:    addiu $2, $sp, 0
 ; MIPS32R2-NEXT:    sw $7, 28($sp)
-; MIPS32R2-NEXT:    sw $6, 24($sp)
 ; MIPS32R2-NEXT:    sw $5, 20($sp)
-; MIPS32R2-NEXT:    sw $4, 16($sp)
-; MIPS32R2-NEXT:    addiu $1, $1, 16
-; MIPS32R2-NEXT:    lw $2, 60($sp)
-; MIPS32R2-NEXT:    srl $3, $2, 3
+; MIPS32R2-NEXT:    addiu $2, $2, 16
+; MIPS32R2-NEXT:    srl $3, $1, 3
 ; MIPS32R2-NEXT:    andi $3, $3, 12
-; MIPS32R2-NEXT:    subu $1, $1, $3
+; MIPS32R2-NEXT:    sw $6, 24($sp)
+; MIPS32R2-NEXT:    sw $4, 16($sp)
+; MIPS32R2-NEXT:    subu $2, $2, $3
 ; MIPS32R2-NEXT:    sw $zero, 12($sp)
-; MIPS32R2-NEXT:    sw $zero, 8($sp)
 ; MIPS32R2-NEXT:    sw $zero, 4($sp)
+; MIPS32R2-NEXT:    lw $3, 4($2)
+; MIPS32R2-NEXT:    sw $zero, 8($sp)
 ; MIPS32R2-NEXT:    sw $zero, 0($sp)
-; MIPS32R2-NEXT:    lw $3, 4($1)
-; MIPS32R2-NEXT:    lw $5, 8($1)
-; MIPS32R2-NEXT:    srlv $4, $5, $2
+; MIPS32R2-NEXT:    lw $5, 8($2)
+; MIPS32R2-NEXT:    srlv $4, $5, $1
 ; MIPS32R2-NEXT:    sll $6, $3, 1
-; MIPS32R2-NEXT:    andi $7, $2, 31
+; MIPS32R2-NEXT:    andi $7, $1, 31
 ; MIPS32R2-NEXT:    xori $7, $7, 31
 ; MIPS32R2-NEXT:    sllv $6, $6, $7
-; MIPS32R2-NEXT:    srlv $3, $3, $2
-; MIPS32R2-NEXT:    lw $8, 0($1)
+; MIPS32R2-NEXT:    srlv $3, $3, $1
+; MIPS32R2-NEXT:    lw $8, 0($2)
 ; MIPS32R2-NEXT:    sll $9, $8, 1
 ; MIPS32R2-NEXT:    sllv $9, $9, $7
 ; MIPS32R2-NEXT:    or $3, $3, $9
 ; MIPS32R2-NEXT:    or $4, $4, $6
-; MIPS32R2-NEXT:    lw $1, 12($1)
-; MIPS32R2-NEXT:    srlv $1, $1, $2
+; MIPS32R2-NEXT:    lw $2, 12($2)
+; MIPS32R2-NEXT:    srlv $2, $2, $1
 ; MIPS32R2-NEXT:    sll $5, $5, 1
 ; MIPS32R2-NEXT:    sllv $5, $5, $7
-; MIPS32R2-NEXT:    or $5, $1, $5
-; MIPS32R2-NEXT:    srlv $2, $8, $2
+; MIPS32R2-NEXT:    or $5, $2, $5
+; MIPS32R2-NEXT:    srlv $2, $8, $1
 ; MIPS32R2-NEXT:    jr $ra
 ; MIPS32R2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -518,39 +518,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6:       # %bb.0: # %entry
 ; MIPS32R6-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R6-NEXT:    addiu $1, $sp, 0
+; MIPS32R6-NEXT:    lw $1, 60($sp)
+; MIPS32R6-NEXT:    addiu $2, $sp, 0
 ; MIPS32R6-NEXT:    sw $7, 28($sp)
-; MIPS32R6-NEXT:    sw $6, 24($sp)
 ; MIPS32R6-NEXT:    sw $5, 20($sp)
-; MIPS32R6-NEXT:    sw $4, 16($sp)
-; MIPS32R6-NEXT:    addiu $1, $1, 16
-; MIPS32R6-NEXT:    lw $2, 60($sp)
-; MIPS32R6-NEXT:    srl $3, $2, 3
+; MIPS32R6-NEXT:    addiu $2, $2, 16
+; MIPS32R6-NEXT:    srl $3, $1, 3
 ; MIPS32R6-NEXT:    andi $3, $3, 12
-; MIPS32R6-NEXT:    subu $1, $1, $3
+; MIPS32R6-NEXT:    sw $6, 24($sp)
+; MIPS32R6-NEXT:    sw $4, 16($sp)
+; MIPS32R6-NEXT:    subu $2, $2, $3
 ; MIPS32R6-NEXT:    sw $zero, 12($sp)
-; MIPS32R6-NEXT:    sw $zero, 8($sp)
 ; MIPS32R6-NEXT:    sw $zero, 4($sp)
+; MIPS32R6-NEXT:    lw $3, 4($2)
+; MIPS32R6-NEXT:    sw $zero, 8($sp)
 ; MIPS32R6-NEXT:    sw $zero, 0($sp)
-; MIPS32R6-NEXT:    lw $3, 4($1)
-; MIPS32R6-NEXT:    lw $5, 8($1)
-; MIPS32R6-NEXT:    srlv $4, $5, $2
+; MIPS32R6-NEXT:    lw $5, 8($2)
+; MIPS32R6-NEXT:    srlv $4, $5, $1
 ; MIPS32R6-NEXT:    sll $6, $3, 1
-; MIPS32R6-NEXT:    andi $7, $2, 31
+; MIPS32R6-NEXT:    andi $7, $1, 31
 ; MIPS32R6-NEXT:    xori $7, $7, 31
 ; MIPS32R6-NEXT:    sllv $6, $6, $7
-; MIPS32R6-NEXT:    srlv $3, $3, $2
-; MIPS32R6-NEXT:    lw $8, 0($1)
+; MIPS32R6-NEXT:    srlv $3, $3, $1
+; MIPS32R6-NEXT:    lw $8, 0($2)
 ; MIPS32R6-NEXT:    sll $9, $8, 1
 ; MIPS32R6-NEXT:    sllv $9, $9, $7
 ; MIPS32R6-NEXT:    or $3, $3, $9
 ; MIPS32R6-NEXT:    or $4, $4, $6
-; MIPS32R6-NEXT:    lw $1, 12($1)
-; MIPS32R6-NEXT:    srlv $1, $1, $2
+; MIPS32R6-NEXT:    lw $2, 12($2)
+; MIPS32R6-NEXT:    srlv $2, $2, $1
 ; MIPS32R6-NEXT:    sll $5, $5, 1
 ; MIPS32R6-NEXT:    sllv $5, $5, $7
-; MIPS32R6-NEXT:    or $5, $1, $5
-; MIPS32R6-NEXT:    srlv $2, $8, $2
+; MIPS32R6-NEXT:    or $5, $2, $5
+; MIPS32R6-NEXT:    srlv $2, $8, $1
 ; MIPS32R6-NEXT:    jr $ra
 ; MIPS32R6-NEXT:    addiu $sp, $sp, 32
 ;
@@ -639,14 +639,16 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    swp $6, 24($sp)
-; MMR3-NEXT:    swp $4, 16($sp)
-; MMR3-NEXT:    sw $2, 12($sp)
-; MMR3-NEXT:    sw $2, 8($sp)
-; MMR3-NEXT:    sw $2, 4($sp)
-; MMR3-NEXT:    sw $2, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
+; MMR3-NEXT:    li16 $3, 0
+; MMR3-NEXT:    sw $7, 28($sp)
+; MMR3-NEXT:    sw $5, 20($sp)
+; MMR3-NEXT:    sw $3, 12($sp)
+; MMR3-NEXT:    sw $3, 4($sp)
+; MMR3-NEXT:    sw $6, 24($sp)
+; MMR3-NEXT:    sw $4, 16($sp)
+; MMR3-NEXT:    sw $3, 8($sp)
+; MMR3-NEXT:    sw $3, 0($sp)
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
 ; MMR3-NEXT:    srl16 $4, $3, 3
@@ -680,16 +682,16 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    .cfi_def_cfa_offset 40
 ; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
 ; MMR6-NEXT:    .cfi_offset 16, -4
-; MMR6-NEXT:    li16 $2, 0
-; MMR6-NEXT:    sw $7, 32($sp)
-; MMR6-NEXT:    sw $6, 28($sp)
-; MMR6-NEXT:    sw $5, 24($sp)
-; MMR6-NEXT:    sw $4, 20($sp)
-; MMR6-NEXT:    sw $2, 16($sp)
-; MMR6-NEXT:    sw $2, 12($sp)
-; MMR6-NEXT:    sw $2, 8($sp)
-; MMR6-NEXT:    sw $2, 4($sp)
-; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    addiu $2, $sp, 0
+; MMR6-NEXT:    li16 $3, 0
+; MMR6-NEXT:    sw $7, 28($sp)
+; MMR6-NEXT:    sw $5, 20($sp)
+; MMR6-NEXT:    sw $3, 12($sp)
+; MMR6-NEXT:    sw $3, 4($sp)
+; MMR6-NEXT:    sw $6, 24($sp)
+; MMR6-NEXT:    sw $4, 16($sp)
+; MMR6-NEXT:    sw $3, 8($sp)
+; MMR6-NEXT:    sw $3, 0($sp)
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
 ; MMR6-NEXT:    srl16 $4, $3, 3
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 394890a9dcc7c..5320f6d8a4353 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -442,18 +442,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS2-NEXT:    lw $1, 60($sp)
 ; MIPS2-NEXT:    srl $2, $1, 3
-; MIPS2-NEXT:    sw $7, 12($sp)
 ; MIPS2-NEXT:    sw $6, 8($sp)
-; MIPS2-NEXT:    sw $5, 4($sp)
 ; MIPS2-NEXT:    sw $4, 0($sp)
 ; MIPS2-NEXT:    andi $2, $2, 12
 ; MIPS2-NEXT:    addiu $3, $sp, 0
+; MIPS2-NEXT:    sw $7, 12($sp)
+; MIPS2-NEXT:    sw $5, 4($sp)
 ; MIPS2-NEXT:    addu $4, $3, $2
-; MIPS2-NEXT:    sw $zero, 28($sp)
 ; MIPS2-NEXT:    sw $zero, 24($sp)
-; MIPS2-NEXT:    sw $zero, 20($sp)
 ; MIPS2-NEXT:    sw $zero, 16($sp)
 ; MIPS2-NEXT:    lw $5, 8($4)
+; MIPS2-NEXT:    sw $zero, 28($sp)
+; MIPS2-NEXT:    sw $zero, 20($sp)
 ; MIPS2-NEXT:    lw $2, 4($4)
 ; MIPS2-NEXT:    sllv $3, $2, $1
 ; MIPS2-NEXT:    srl $6, $5, 1
@@ -481,18 +481,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32-NEXT:    lw $1, 60($sp)
 ; MIPS32-NEXT:    srl $2, $1, 3
-; MIPS32-NEXT:    sw $7, 12($sp)
 ; MIPS32-NEXT:    sw $6, 8($sp)
-; MIPS32-NEXT:    sw $5, 4($sp)
 ; MIPS32-NEXT:    sw $4, 0($sp)
 ; MIPS32-NEXT:    andi $2, $2, 12
 ; MIPS32-NEXT:    addiu $3, $sp, 0
+; MIPS32-NEXT:    sw $7, 12($sp)
+; MIPS32-NEXT:    sw $5, 4($sp)
 ; MIPS32-NEXT:    addu $4, $3, $2
-; MIPS32-NEXT:    sw $zero, 28($sp)
 ; MIPS32-NEXT:    sw $zero, 24($sp)
-; MIPS32-NEXT:    sw $zero, 20($sp)
 ; MIPS32-NEXT:    sw $zero, 16($sp)
 ; MIPS32-NEXT:    lw $5, 8($4)
+; MIPS32-NEXT:    sw $zero, 28($sp)
+; MIPS32-NEXT:    sw $zero, 20($sp)
 ; MIPS32-NEXT:    lw $2, 4($4)
 ; MIPS32-NEXT:    sllv $3, $2, $1
 ; MIPS32-NEXT:    srl $6, $5, 1
@@ -520,18 +520,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R2-NEXT:    lw $1, 60($sp)
 ; MIPS32R2-NEXT:    srl $2, $1, 3
-; MIPS32R2-NEXT:    sw $7, 12($sp)
 ; MIPS32R2-NEXT:    sw $6, 8($sp)
-; MIPS32R2-NEXT:    sw $5, 4($sp)
 ; MIPS32R2-NEXT:    sw $4, 0($sp)
 ; MIPS32R2-NEXT:    andi $2, $2, 12
 ; MIPS32R2-NEXT:    addiu $3, $sp, 0
+; MIPS32R2-NEXT:    sw $7, 12($sp)
+; MIPS32R2-NEXT:    sw $5, 4($sp)
 ; MIPS32R2-NEXT:    addu $4, $3, $2
-; MIPS32R2-NEXT:    sw $zero, 28($sp)
 ; MIPS32R2-NEXT:    sw $zero, 24($sp)
-; MIPS32R2-NEXT:    sw $zero, 20($sp)
 ; MIPS32R2-NEXT:    sw $zero, 16($sp)
 ; MIPS32R2-NEXT:    lw $5, 8($4)
+; MIPS32R2-NEXT:    sw $zero, 28($sp)
+; MIPS32R2-NEXT:    sw $zero, 20($sp)
 ; MIPS32R2-NEXT:    lw $2, 4($4)
 ; MIPS32R2-NEXT:    sllv $3, $2, $1
 ; MIPS32R2-NEXT:    srl $6, $5, 1
@@ -559,18 +559,18 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R6-NEXT:    lw $1, 60($sp)
 ; MIPS32R6-NEXT:    srl $2, $1, 3
-; MIPS32R6-NEXT:    sw $7, 12($sp)
 ; MIPS32R6-NEXT:    sw $6, 8($sp)
-; MIPS32R6-NEXT:    sw $5, 4($sp)
 ; MIPS32R6-NEXT:    sw $4, 0($sp)
 ; MIPS32R6-NEXT:    andi $2, $2, 12
 ; MIPS32R6-NEXT:    addiu $3, $sp, 0
+; MIPS32R6-NEXT:    sw $7, 12($sp)
+; MIPS32R6-NEXT:    sw $5, 4($sp)
 ; MIPS32R6-NEXT:    addu $4, $3, $2
-; MIPS32R6-NEXT:    sw $zero, 28($sp)
 ; MIPS32R6-NEXT:    sw $zero, 24($sp)
-; MIPS32R6-NEXT:    sw $zero, 20($sp)
 ; MIPS32R6-NEXT:    sw $zero, 16($sp)
 ; MIPS32R6-NEXT:    lw $5, 8($4)
+; MIPS32R6-NEXT:    sw $zero, 28($sp)
+; MIPS32R6-NEXT:    sw $zero, 20($sp)
 ; MIPS32R6-NEXT:    lw $2, 4($4)
 ; MIPS32R6-NEXT:    sllv $3, $2, $1
 ; MIPS32R6-NEXT:    srl $6, $5, 1
@@ -680,82 +680,86 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR3:       # %bb.0: # %entry
 ; MMR3-NEXT:    addiusp -40
 ; MMR3-NEXT:    .cfi_def_cfa_offset 40
-; MMR3-NEXT:    swp $16, 32($sp)
-; MMR3-NEXT:    .cfi_offset 17, -4
-; MMR3-NEXT:    .cfi_offset 16, -8
+; MMR3-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
+; MMR3-NEXT:    .cfi_offset 16, -4
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    sw $2, 28($sp)
+; MMR3-NEXT:    sw $2, 16($sp)
+; MMR3-NEXT:    lw $3, 68($sp)
 ; MMR3-NEXT:    sw $2, 24($sp)
+; MMR3-NEXT:    sw $6, 8($sp)
+; MMR3-NEXT:    sw $4, 0($sp)
+; MMR3-NEXT:    sw $2, 28($sp)
 ; MMR3-NEXT:    sw $2, 20($sp)
-; MMR3-NEXT:    sw $2, 16($sp)
-; MMR3-NEXT:    swp $6, 8($sp)
-; MMR3-NEXT:    swp $4, 0($sp)
-; MMR3-NEXT:    lw $2, 68($sp)
-; MMR3-NEXT:    srl16 $3, $2, 3
-; MMR3-NEXT:    andi $3, $3, 12
+; MMR3-NEXT:    sw $7, 12($sp)
+; MMR3-NEXT:    sw $5, 4($sp)
+; MMR3-NEXT:    srl16 $2, $3, 3
+; MMR3-NEXT:    andi $2, $2, 12
 ; MMR3-NEXT:    addiur1sp $4, 0
-; MMR3-NEXT:    addu16 $4, $4, $3
+; MMR3-NEXT:    addu16 $4, $4, $2
 ; MMR3-NEXT:    lw16 $6, 8($4)
-; MMR3-NEXT:    lw16 $7, 4($4)
-; MMR3-NEXT:    andi16 $5, $2, 31
-; MMR3-NEXT:    sllv $16, $7, $5
-; MMR3-NEXT:    srl16 $2, $6, 1
+; MMR3-NEXT:    lw16 $2, 4($4)
+; MMR3-NEXT:    andi16 $5, $3, 31
+; MMR3-NEXT:    sllv $7, $2, $5
+; MMR3-NEXT:    srl16 $3, $6, 1
 ; MMR3-NEXT:    xori $1, $5, 31
-; MMR3-NEXT:    srlv $3, $2, $1
-; MMR3-NEXT:    lw16 $2, 0($4)
-; MMR3-NEXT:    sllv $17, $2, $5
-; MMR3-NEXT:    srl16 $2, $7, 1
+; MMR3-NEXT:    srlv $3, $3, $1
+; MMR3-NEXT:    lw16 $16, 0($4)
+; MMR3-NEXT:    sllv $16, $16, $5
+; MMR3-NEXT:    srl16 $2, $2, 1
 ; MMR3-NEXT:    srlv $2, $2, $1
-; MMR3-NEXT:    or16 $2, $17
-; MMR3-NEXT:    or16 $3, $16
+; MMR3-NEXT:    or16 $2, $16
+; MMR3-NEXT:    or16 $3, $7
 ; MMR3-NEXT:    sllv $6, $6, $5
 ; MMR3-NEXT:    lw16 $7, 12($4)
 ; MMR3-NEXT:    srl16 $4, $7, 1
 ; MMR3-NEXT:    srlv $4, $4, $1
 ; MMR3-NEXT:    or16 $4, $6
 ; MMR3-NEXT:    sllv $5, $7, $5
-; MMR3-NEXT:    lwp $16, 32($sp)
+; MMR3-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: shl_i128:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    addiu $sp, $sp, -32
-; MMR6-NEXT:    .cfi_def_cfa_offset 32
+; MMR6-NEXT:    addiu $sp, $sp, -40
+; MMR6-NEXT:    .cfi_def_cfa_offset 40
+; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
+; MMR6-NEXT:    .cfi_offset 16, -4
 ; MMR6-NEXT:    li16 $2, 0
-; MMR6-NEXT:    sw $2, 28($sp)
+; MMR6-NEXT:    sw $2, 16($sp)
+; MMR6-NEXT:    lw $3, 68($sp)
 ; MMR6-NEXT:    sw $2, 24($sp)
+; MMR6-NEXT:    sw $6, 8($sp)
+; MMR6-NEXT:    sw $4, 0($sp)
+; MMR6-NEXT:    sw $2, 28($sp)
 ; MMR6-NEXT:    sw $2, 20($sp)
-; MMR6-NEXT:    sw $2, 16($sp)
 ; MMR6-NEXT:    sw $7, 12($sp)
-; MMR6-NEXT:    sw $6, 8($sp)
 ; MMR6-NEXT:    sw $5, 4($sp)
-; MMR6-NEXT:    sw $4, 0($sp)
-; MMR6-NEXT:    lw $2, 60($sp)
-; MMR6-NEXT:    srl16 $3, $2, 3
-; MMR6-NEXT:    andi $3, $3, 12
+; MMR6-NEXT:    srl16 $2, $3, 3
+; MMR6-NEXT:    andi $2, $2, 12
 ; MMR6-NEXT:    addiu $4, $sp, 0
-; MMR6-NEXT:    addu16 $4, $4, $3
+; MMR6-NEXT:    addu16 $4, $4, $2
 ; MMR6-NEXT:    lw16 $5, 8($4)
-; MMR6-NEXT:    lw16 $3, 4($4)
-; MMR6-NEXT:    andi16 $6, $2, 31
-; MMR6-NEXT:    sllv $1, $3, $6
-; MMR6-NEXT:    srl16 $2, $5, 1
+; MMR6-NEXT:    lw16 $2, 4($4)
+; MMR6-NEXT:    andi16 $6, $3, 31
+; MMR6-NEXT:    sllv $1, $2, $6
+; MMR6-NEXT:    srl16 $3, $5, 1
 ; MMR6-NEXT:    xori $7, $6, 31
-; MMR6-NEXT:    srlv $8, $2, $7
-; MMR6-NEXT:    lw16 $2, 0($4)
-; MMR6-NEXT:    sllv $2, $2, $6
-; MMR6-NEXT:    srl16 $3, $3, 1
 ; MMR6-NEXT:    srlv $3, $3, $7
-; MMR6-NEXT:    or $2, $2, $3
-; MMR6-NEXT:    or $3, $1, $8
+; MMR6-NEXT:    lw16 $16, 0($4)
+; MMR6-NEXT:    sllv $8, $16, $6
+; MMR6-NEXT:    srl16 $2, $2, 1
+; MMR6-NEXT:    srlv $2, $2, $7
+; MMR6-NEXT:    or $2, $8, $2
+; MMR6-NEXT:    or $3, $1, $3
 ; MMR6-NEXT:    sllv $1, $5, $6
 ; MMR6-NEXT:    lw16 $5, 12($4)
 ; MMR6-NEXT:    srl16 $4, $5, 1
 ; MMR6-NEXT:    srlv $4, $4, $7
 ; MMR6-NEXT:    or $4, $1, $4
 ; MMR6-NEXT:    sllv $5, $5, $6
-; MMR6-NEXT:    addiu $sp, $sp, 32
+; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
+; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra
 entry:
 
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index ae25feeb8893c..82ddef55eba30 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -13,22 +13,22 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
 ; CHECK-NEXT:    li 6, 0
 ; CHECK-NEXT:    mtctr 7
-; CHECK-NEXT:    addi 7, 1, 20
+; CHECK-NEXT:    addi 7, 1, 16
 ; CHECK-NEXT:  .LBB0_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 8, 0(4)
-; CHECK-NEXT:    lwz 9, 4(4)
-; CHECK-NEXT:    lwz 10, 8(4)
-; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    lwz 8, 4(4)
+; CHECK-NEXT:    lwz 9, 12(4)
+; CHECK-NEXT:    lwz 10, 0(4)
+; CHECK-NEXT:    lwz 11, 8(4)
 ; CHECK-NEXT:    lwz 12, 12(5)
-; CHECK-NEXT:    stw 6, 48(1)
-; CHECK-NEXT:    stw 6, 44(1)
 ; CHECK-NEXT:    stw 6, 40(1)
+; CHECK-NEXT:    stw 6, 32(1)
+; CHECK-NEXT:    stw 6, 44(1)
 ; CHECK-NEXT:    stw 6, 36(1)
-; CHECK-NEXT:    stw 11, 32(1)
-; CHECK-NEXT:    stw 10, 28(1)
+; CHECK-NEXT:    stw 11, 24(1)
+; CHECK-NEXT:    stw 10, 16(1)
 ; CHECK-NEXT:    clrlwi 10, 12, 27
-; CHECK-NEXT:    stw 9, 24(1)
+; CHECK-NEXT:    stw 9, 28(1)
 ; CHECK-NEXT:    stw 8, 20(1)
 ; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
 ; CHECK-NEXT:    lwzux 9, 8, 7
@@ -84,22 +84,22 @@ define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-NEXT:    addi 6, 1, 24
 ; CHECK-NEXT:  .LBB1_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 7, 0(4)
-; CHECK-NEXT:    lwz 8, 4(4)
+; CHECK-NEXT:    lwz 7, 8(4)
+; CHECK-NEXT:    lwz 8, 0(4)
 ; CHECK-NEXT:    lwz 11, 12(5)
-; CHECK-NEXT:    lwz 9, 8(4)
+; CHECK-NEXT:    lwz 9, 4(4)
 ; CHECK-NEXT:    lwz 10, 12(4)
-; CHECK-NEXT:    stw 8, 28(1)
+; CHECK-NEXT:    stw 7, 32(1)
+; CHECK-NEXT:    srawi 7, 8, 31
+; CHECK-NEXT:    stw 8, 24(1)
 ; CHECK-NEXT:    rlwinm 8, 11, 29, 28, 29
-; CHECK-NEXT:    stw 7, 24(1)
-; CHECK-NEXT:    srawi 7, 7, 31
 ; CHECK-NEXT:    stw 10, 36(1)
 ; CHECK-NEXT:    clrlwi 10, 11, 27
-; CHECK-NEXT:    stw 9, 32(1)
+; CHECK-NEXT:    stw 9, 28(1)
 ; CHECK-NEXT:    subfic 12, 10, 32
 ; CHECK-NEXT:    stw 7, 20(1)
-; CHECK-NEXT:    stw 7, 16(1)
 ; CHECK-NEXT:    stw 7, 12(1)
+; CHECK-NEXT:    stw 7, 16(1)
 ; CHECK-NEXT:    stw 7, 8(1)
 ; CHECK-NEXT:    sub 7, 6, 8
 ; CHECK-NEXT:    lwz 8, 4(7)
@@ -152,24 +152,24 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
 ; CHECK-NEXT:    li 6, 0
 ; CHECK-NEXT:    mtctr 7
-; CHECK-NEXT:    addi 7, 1, 36
+; CHECK-NEXT:    addi 7, 1, 32
 ; CHECK-NEXT:  .LBB2_1: # %for.body
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    lwz 8, 0(4)
 ; CHECK-NEXT:    lwz 12, 12(5)
-; CHECK-NEXT:    lwz 9, 4(4)
-; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 9, 8(4)
+; CHECK-NEXT:    lwz 10, 4(4)
 ; CHECK-NEXT:    lwz 11, 12(4)
-; CHECK-NEXT:    stw 8, 36(1)
+; CHECK-NEXT:    stw 8, 32(1)
 ; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
-; CHECK-NEXT:    stw 6, 32(1)
-; CHECK-NEXT:    sub 8, 7, 8
 ; CHECK-NEXT:    stw 6, 28(1)
-; CHECK-NEXT:    stw 6, 24(1)
+; CHECK-NEXT:    sub 8, 7, 8
 ; CHECK-NEXT:    stw 6, 20(1)
-; CHECK-NEXT:    stw 11, 48(1)
+; CHECK-NEXT:    stw 6, 24(1)
+; CHECK-NEXT:    stw 6, 16(1)
+; CHECK-NEXT:    stw 11, 44(1)
 ; CHECK-NEXT:    clrlwi 11, 12, 27
-; CHECK-NEXT:    stw 10, 44(1)
+; CHECK-NEXT:    stw 10, 36(1)
 ; CHECK-NEXT:    subfic 0, 11, 32
 ; CHECK-NEXT:    stw 9, 40(1)
 ; CHECK-NEXT:    lwz 9, 4(8)
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index cc90300aafcea..8932733db1e40 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -32,20 +32,21 @@ define void @pr59074(ptr %0) {
 ; LE32-NEXT:    li 7, 0
 ; LE32-NEXT:    li 8, 12
 ; LE32-NEXT:    xxswapd 0, 0
+; LE32-NEXT:    rlwimi 5, 6, 0, 30, 28
 ; LE32-NEXT:    addi 4, 4, -12
 ; LE32-NEXT:    rlwinm 9, 4, 29, 28, 29
-; LE32-NEXT:    stxvd2x 0, 6, 5
-; LE32-NEXT:    stw 7, 44(1)
+; LE32-NEXT:    stxvd2x 0, 0, 5
 ; LE32-NEXT:    stw 7, 40(1)
-; LE32-NEXT:    stw 7, 36(1)
 ; LE32-NEXT:    stw 8, 16(1)
+; LE32-NEXT:    stw 7, 44(1)
+; LE32-NEXT:    stw 7, 36(1)
 ; LE32-NEXT:    clrlwi 4, 4, 27
 ; LE32-NEXT:    lwzux 5, 9, 6
-; LE32-NEXT:    subfic 11, 4, 32
 ; LE32-NEXT:    lwz 6, 8(9)
 ; LE32-NEXT:    lwz 7, 4(9)
 ; LE32-NEXT:    lwz 8, 12(9)
 ; LE32-NEXT:    xori 9, 4, 31
+; LE32-NEXT:    subfic 11, 4, 32
 ; LE32-NEXT:    srw 5, 5, 4
 ; LE32-NEXT:    slwi 10, 6, 1
 ; LE32-NEXT:    srw 6, 6, 4
@@ -90,8 +91,8 @@ define void @pr59074(ptr %0) {
 ; BE32-NEXT:    stxvw4x 0, 0, 5
 ; BE32-NEXT:    stw 6, -36(1)
 ; BE32-NEXT:    addi 4, 4, -12
-; BE32-NEXT:    stw 7, -40(1)
 ; BE32-NEXT:    stw 7, -44(1)
+; BE32-NEXT:    stw 7, -40(1)
 ; BE32-NEXT:    stw 7, -48(1)
 ; BE32-NEXT:    rlwinm 9, 4, 29, 28, 29
 ; BE32-NEXT:    clrlwi 4, 4, 27
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index f6fdb4ae20794..29b91d8ef89dc 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -229,29 +229,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 4(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    addi 3, 1, 32
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
 ; LE-32BIT-NEXT:    stw 6, 28(1)
-; LE-32BIT-NEXT:    sub 3, 3, 4
-; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
-; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 32
+; LE-32BIT-NEXT:    stw 9, 36(1)
+; LE-32BIT-NEXT:    sub 3, 3, 6
+; LE-32BIT-NEXT:    stw 8, 40(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
 ; LE-32BIT-NEXT:    stw 7, 32(1)
-; LE-32BIT-NEXT:    lwz 4, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    stw 7, 8(5)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    srw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    stw 6, 0(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -301,30 +313,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: shl_16bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    lwz 9, 0(3)
+; LE-32BIT-NEXT:    lwz 3, 8(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    stw 6, 44(1)
 ; LE-32BIT-NEXT:    stw 6, 40(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
-; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
-; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 16
-; LE-32BIT-NEXT:    stw 9, 24(1)
-; LE-32BIT-NEXT:    stw 8, 20(1)
-; LE-32BIT-NEXT:    stw 7, 16(1)
-; LE-32BIT-NEXT:    lwzux 3, 4, 3
-; LE-32BIT-NEXT:    lwz 6, 4(4)
-; LE-32BIT-NEXT:    lwz 7, 12(4)
-; LE-32BIT-NEXT:    lwz 4, 8(4)
+; LE-32BIT-NEXT:    stw 9, 16(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 8, 28(1)
+; LE-32BIT-NEXT:    subfic 8, 4, 32
+; LE-32BIT-NEXT:    stw 7, 20(1)
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    lwz 9, 4(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 7, 8(6)
+; LE-32BIT-NEXT:    lwz 6, 12(6)
+; LE-32BIT-NEXT:    slw 11, 9, 4
+; LE-32BIT-NEXT:    srw 9, 9, 8
+; LE-32BIT-NEXT:    srw 10, 7, 8
+; LE-32BIT-NEXT:    srw 8, 6, 8
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    or 3, 3, 9
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    or 4, 7, 8
 ; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
 ; LE-32BIT-NEXT:    stw 4, 8(5)
-; LE-32BIT-NEXT:    stw 7, 12(5)
-; LE-32BIT-NEXT:    stw 6, 4(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -380,31 +404,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: ashr_16bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
 ; LE-32BIT-NEXT:    addi 6, 1, 32
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    lwz 9, 4(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
-; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    stw 8, 36(1)
-; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    srawi 3, 8, 31
+; LE-32BIT-NEXT:    stw 7, 40(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 9, 36(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 8, 32(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 16(1)
-; LE-32BIT-NEXT:    sub 3, 6, 4
-; LE-32BIT-NEXT:    lwz 4, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    stw 7, 8(5)
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    sraw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    stw 6, 0(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -422,20 +458,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
 ; LE-64BIT-NEXT:    xxlxor 2, 2, 2
 ; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    li 8, 32
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
 ; LE-64BIT-NEXT:    lwz 3, 0(4)
 ; LE-64BIT-NEXT:    li 4, 48
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
-; LE-64BIT-NEXT:    li 4, 32
-; LE-64BIT-NEXT:    clrldi 3, 3, 59
-; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
+; LE-64BIT-NEXT:    rlwinm 4, 3, 0, 27, 28
+; LE-64BIT-NEXT:    rlwinm 3, 3, 3, 26, 28
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
-; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
-; LE-64BIT-NEXT:    add 3, 7, 3
-; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
-; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
-; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
+; LE-64BIT-NEXT:    ldux 6, 4, 7
+; LE-64BIT-NEXT:    subfic 7, 3, 64
+; LE-64BIT-NEXT:    ld 8, 8(4)
+; LE-64BIT-NEXT:    ld 9, 16(4)
+; LE-64BIT-NEXT:    ld 4, 24(4)
+; LE-64BIT-NEXT:    srd 6, 6, 3
+; LE-64BIT-NEXT:    sld 10, 8, 7
+; LE-64BIT-NEXT:    sld 11, 4, 7
+; LE-64BIT-NEXT:    srd 8, 8, 3
+; LE-64BIT-NEXT:    sld 7, 9, 7
+; LE-64BIT-NEXT:    or 6, 10, 6
+; LE-64BIT-NEXT:    srd 10, 9, 3
+; LE-64BIT-NEXT:    srd 3, 4, 3
+; LE-64BIT-NEXT:    or 7, 7, 8
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 7, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: lshr_32bytes:
@@ -445,79 +496,126 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 24(9)
-; BE-NEXT:    std 10, 16(9)
-; BE-NEXT:    std 10, 8(9)
-; BE-NEXT:    std 10, -64(1)
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    clrlwi 3, 4, 27
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 9, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 9, -56(1)
+; BE-NEXT:    std 9, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 0, 27, 28
 ; BE-NEXT:    neg 3, 3
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
-; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    std 8, -16(1)
+; BE-NEXT:    std 7, -24(1)
+; BE-NEXT:    std 6, -32(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    addi 4, 1, -32
-; BE-NEXT:    ldux 3, 4, 3
-; BE-NEXT:    ld 6, 8(4)
-; BE-NEXT:    ld 7, 24(4)
-; BE-NEXT:    ld 4, 16(4)
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 6, 8(10)
+; BE-NEXT:    ld 7, 24(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srd 3, 3, 4
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    srd 11, 6, 4
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    sld 6, 6, 9
+; BE-NEXT:    sld 9, 8, 9
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    std 6, 16(5)
 ; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    stwu 1, -112(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 16(3)
+; LE-32BIT-NEXT:    lwz 10, 24(3)
+; LE-32BIT-NEXT:    lwz 11, 4(3)
+; LE-32BIT-NEXT:    lwz 12, 12(3)
+; LE-32BIT-NEXT:    lwz 0, 20(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 3, 76(1)
-; LE-32BIT-NEXT:    addi 3, 1, 48
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 6, 44(1)
-; LE-32BIT-NEXT:    sub 3, 3, 4
-; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
-; LE-32BIT-NEXT:    stw 6, 32(1)
 ; LE-32BIT-NEXT:    stw 6, 28(1)
-; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
-; LE-32BIT-NEXT:    stw 0, 72(1)
-; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    stw 11, 64(1)
-; LE-32BIT-NEXT:    stw 10, 60(1)
-; LE-32BIT-NEXT:    stw 9, 56(1)
-; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    sub 3, 3, 6
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 0, 68(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 12, 60(1)
+; LE-32BIT-NEXT:    stw 11, 52(1)
+; LE-32BIT-NEXT:    stw 10, 72(1)
+; LE-32BIT-NEXT:    stw 9, 64(1)
+; LE-32BIT-NEXT:    stw 8, 56(1)
 ; LE-32BIT-NEXT:    stw 7, 48(1)
-; LE-32BIT-NEXT:    lwz 4, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    lwz 8, 8(3)
-; LE-32BIT-NEXT:    lwz 9, 20(3)
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    lwz 11, 24(3)
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    srw 28, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 0, 12
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    stw 10, 16(5)
-; LE-32BIT-NEXT:    stw 9, 20(5)
-; LE-32BIT-NEXT:    stw 8, 8(5)
-; LE-32BIT-NEXT:    stw 7, 12(5)
-; LE-32BIT-NEXT:    stw 6, 0(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
-; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -530,26 +628,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: shl_32bytes:
 ; LE-64BIT:       # %bb.0:
 ; LE-64BIT-NEXT:    li 6, 16
-; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    lwz 4, 0(4)
 ; LE-64BIT-NEXT:    xxlxor 2, 2, 2
-; LE-64BIT-NEXT:    li 7, 48
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    addi 8, 1, -32
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
-; LE-64BIT-NEXT:    lwz 3, 0(4)
-; LE-64BIT-NEXT:    addi 4, 1, -64
-; LE-64BIT-NEXT:    stxvd2x 2, 4, 6
-; LE-64BIT-NEXT:    clrlwi 3, 3, 27
-; LE-64BIT-NEXT:    stxvd2x 0, 4, 7
-; LE-64BIT-NEXT:    li 7, 32
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 6
+; LE-64BIT-NEXT:    li 6, 48
+; LE-64BIT-NEXT:    rlwinm 3, 4, 0, 27, 28
+; LE-64BIT-NEXT:    rlwinm 4, 4, 3, 26, 28
 ; LE-64BIT-NEXT:    neg 3, 3
-; LE-64BIT-NEXT:    stxvd2x 1, 4, 7
-; LE-64BIT-NEXT:    stxvd2x 2, 0, 4
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    li 6, 32
 ; LE-64BIT-NEXT:    extsw 3, 3
-; LE-64BIT-NEXT:    addi 4, 1, -32
-; LE-64BIT-NEXT:    lxvd2x 0, 4, 3
-; LE-64BIT-NEXT:    add 3, 4, 3
-; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
-; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
-; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
+; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
+; LE-64BIT-NEXT:    subfic 6, 4, 64
+; LE-64BIT-NEXT:    ldux 3, 8, 3
+; LE-64BIT-NEXT:    ld 7, 16(8)
+; LE-64BIT-NEXT:    ld 9, 24(8)
+; LE-64BIT-NEXT:    ld 8, 8(8)
+; LE-64BIT-NEXT:    srd 10, 7, 6
+; LE-64BIT-NEXT:    sld 9, 9, 4
+; LE-64BIT-NEXT:    sld 7, 7, 4
+; LE-64BIT-NEXT:    or 9, 9, 10
+; LE-64BIT-NEXT:    srd 10, 8, 6
+; LE-64BIT-NEXT:    srd 6, 3, 6
+; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    or 6, 8, 6
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    or 3, 7, 10
+; LE-64BIT-NEXT:    std 9, 24(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: shl_32bytes:
@@ -559,75 +672,123 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 56(9)
-; BE-NEXT:    std 10, 48(9)
-; BE-NEXT:    std 10, 40(9)
-; BE-NEXT:    std 10, 32(9)
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 8, 16(9)
-; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -64
+; BE-NEXT:    std 9, -8(1)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 9, -24(1)
+; BE-NEXT:    std 9, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 8, -48(1)
+; BE-NEXT:    std 7, -56(1)
 ; BE-NEXT:    std 6, -64(1)
-; BE-NEXT:    clrldi 3, 4, 59
-; BE-NEXT:    ldux 4, 3, 9
-; BE-NEXT:    ld 6, 8(3)
-; BE-NEXT:    ld 7, 24(3)
-; BE-NEXT:    ld 3, 16(3)
-; BE-NEXT:    std 4, 0(5)
-; BE-NEXT:    std 3, 16(5)
-; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    rlwinm 3, 4, 0, 27, 28
+; BE-NEXT:    ldux 6, 3, 10
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    sld 6, 6, 4
+; BE-NEXT:    srd 10, 7, 9
+; BE-NEXT:    sld 11, 8, 4
+; BE-NEXT:    srd 8, 8, 9
+; BE-NEXT:    srd 9, 3, 9
+; BE-NEXT:    sld 7, 7, 4
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    or 10, 11, 10
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    or 7, 7, 9
+; BE-NEXT:    std 3, 24(5)
+; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    std 6, 0(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -80(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
-; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 28(3)
+; LE-32BIT-NEXT:    lwz 11, 0(3)
+; LE-32BIT-NEXT:    lwz 12, 8(3)
+; LE-32BIT-NEXT:    lwz 0, 16(3)
+; LE-32BIT-NEXT:    lwz 3, 24(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 6, 72(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
-; LE-32BIT-NEXT:    stw 6, 68(1)
 ; LE-32BIT-NEXT:    stw 6, 64(1)
-; LE-32BIT-NEXT:    stw 6, 60(1)
 ; LE-32BIT-NEXT:    stw 6, 56(1)
-; LE-32BIT-NEXT:    stw 6, 52(1)
 ; LE-32BIT-NEXT:    stw 6, 48(1)
-; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
+; LE-32BIT-NEXT:    stw 6, 52(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 3, 40(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 16
-; LE-32BIT-NEXT:    stw 0, 40(1)
-; LE-32BIT-NEXT:    stw 12, 36(1)
-; LE-32BIT-NEXT:    stw 11, 32(1)
-; LE-32BIT-NEXT:    stw 10, 28(1)
-; LE-32BIT-NEXT:    stw 9, 24(1)
-; LE-32BIT-NEXT:    stw 8, 20(1)
-; LE-32BIT-NEXT:    stw 7, 16(1)
-; LE-32BIT-NEXT:    lwzux 3, 4, 3
-; LE-32BIT-NEXT:    lwz 6, 4(4)
-; LE-32BIT-NEXT:    lwz 7, 12(4)
-; LE-32BIT-NEXT:    lwz 8, 8(4)
-; LE-32BIT-NEXT:    lwz 9, 20(4)
-; LE-32BIT-NEXT:    lwz 10, 16(4)
-; LE-32BIT-NEXT:    lwz 11, 28(4)
-; LE-32BIT-NEXT:    lwz 4, 24(4)
-; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    stw 0, 32(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 12, 24(1)
+; LE-32BIT-NEXT:    subfic 12, 4, 32
+; LE-32BIT-NEXT:    stw 11, 16(1)
+; LE-32BIT-NEXT:    stw 10, 44(1)
+; LE-32BIT-NEXT:    stw 9, 36(1)
+; LE-32BIT-NEXT:    stw 8, 28(1)
+; LE-32BIT-NEXT:    stw 7, 20(1)
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    lwz 7, 8(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 8, 4(6)
+; LE-32BIT-NEXT:    lwz 9, 16(6)
+; LE-32BIT-NEXT:    srw 30, 7, 12
+; LE-32BIT-NEXT:    lwz 10, 12(6)
+; LE-32BIT-NEXT:    slw 29, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 24(6)
+; LE-32BIT-NEXT:    srw 8, 8, 12
+; LE-32BIT-NEXT:    lwz 0, 20(6)
+; LE-32BIT-NEXT:    srw 28, 9, 12
+; LE-32BIT-NEXT:    lwz 6, 28(6)
+; LE-32BIT-NEXT:    slw 27, 10, 4
+; LE-32BIT-NEXT:    srw 10, 10, 12
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    srw 26, 11, 12
+; LE-32BIT-NEXT:    slw 25, 0, 4
+; LE-32BIT-NEXT:    srw 0, 0, 12
+; LE-32BIT-NEXT:    slw 9, 9, 4
+; LE-32BIT-NEXT:    srw 12, 6, 12
+; LE-32BIT-NEXT:    slw 11, 11, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    stw 4, 28(5)
+; LE-32BIT-NEXT:    or 4, 11, 12
 ; LE-32BIT-NEXT:    stw 4, 24(5)
-; LE-32BIT-NEXT:    stw 11, 28(5)
-; LE-32BIT-NEXT:    stw 10, 16(5)
-; LE-32BIT-NEXT:    stw 9, 20(5)
-; LE-32BIT-NEXT:    stw 8, 8(5)
-; LE-32BIT-NEXT:    stw 7, 12(5)
-; LE-32BIT-NEXT:    stw 6, 4(5)
-; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    or 4, 9, 0
+; LE-32BIT-NEXT:    stw 4, 16(5)
+; LE-32BIT-NEXT:    or 4, 25, 26
+; LE-32BIT-NEXT:    stw 4, 20(5)
+; LE-32BIT-NEXT:    or 4, 7, 10
+; LE-32BIT-NEXT:    or 3, 3, 8
+; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    or 4, 27, 28
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -639,26 +800,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 6, 24(3)
 ; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
-; LE-64BIT-NEXT:    ld 6, 16(3)
-; LE-64BIT-NEXT:    ld 3, 24(3)
-; LE-64BIT-NEXT:    addi 7, 1, -64
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    li 8, 16
-; LE-64BIT-NEXT:    std 3, 24(7)
-; LE-64BIT-NEXT:    sradi 3, 3, 63
-; LE-64BIT-NEXT:    std 6, 16(7)
-; LE-64BIT-NEXT:    std 3, 56(7)
-; LE-64BIT-NEXT:    std 3, 48(7)
-; LE-64BIT-NEXT:    std 3, 40(7)
-; LE-64BIT-NEXT:    std 3, 32(7)
-; LE-64BIT-NEXT:    clrldi 3, 4, 59
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    ld 3, 16(3)
+; LE-64BIT-NEXT:    sradi 8, 6, 63
+; LE-64BIT-NEXT:    rlwinm 9, 4, 0, 27, 28
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 7
-; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
-; LE-64BIT-NEXT:    add 3, 7, 3
-; LE-64BIT-NEXT:    lxvd2x 1, 3, 8
-; LE-64BIT-NEXT:    stxvd2x 1, 5, 8
-; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
+; LE-64BIT-NEXT:    std 6, -40(1)
+; LE-64BIT-NEXT:    std 3, -48(1)
+; LE-64BIT-NEXT:    std 8, -8(1)
+; LE-64BIT-NEXT:    std 8, -16(1)
+; LE-64BIT-NEXT:    std 8, -24(1)
+; LE-64BIT-NEXT:    std 8, -32(1)
+; LE-64BIT-NEXT:    rlwinm 3, 4, 3, 26, 28
+; LE-64BIT-NEXT:    ldux 4, 9, 7
+; LE-64BIT-NEXT:    ld 7, 8(9)
+; LE-64BIT-NEXT:    subfic 6, 3, 64
+; LE-64BIT-NEXT:    ld 8, 16(9)
+; LE-64BIT-NEXT:    ld 9, 24(9)
+; LE-64BIT-NEXT:    srd 4, 4, 3
+; LE-64BIT-NEXT:    sld 10, 7, 6
+; LE-64BIT-NEXT:    sld 11, 9, 6
+; LE-64BIT-NEXT:    srd 7, 7, 3
+; LE-64BIT-NEXT:    sld 6, 8, 6
+; LE-64BIT-NEXT:    or 4, 10, 4
+; LE-64BIT-NEXT:    srd 10, 8, 3
+; LE-64BIT-NEXT:    srad 3, 9, 3
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 4, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: ashr_32bytes:
@@ -668,79 +843,126 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 9, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 6, 1, -64
-; BE-NEXT:    std 3, 56(6)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    std 7, -32(1)
 ; BE-NEXT:    sradi 3, 7, 63
-; BE-NEXT:    clrlwi 4, 4, 27
-; BE-NEXT:    std 3, 24(6)
-; BE-NEXT:    std 3, 16(6)
-; BE-NEXT:    std 3, 8(6)
+; BE-NEXT:    rlwinm 7, 4, 0, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
 ; BE-NEXT:    std 3, -64(1)
-; BE-NEXT:    neg 3, 4
-; BE-NEXT:    std 9, 48(6)
-; BE-NEXT:    std 8, 40(6)
-; BE-NEXT:    std 7, 32(6)
+; BE-NEXT:    neg 3, 7
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    addi 4, 1, -32
-; BE-NEXT:    ldux 3, 4, 3
-; BE-NEXT:    ld 6, 8(4)
-; BE-NEXT:    ld 7, 24(4)
-; BE-NEXT:    ld 4, 16(4)
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 8(6)
+; BE-NEXT:    ld 8, 24(6)
+; BE-NEXT:    ld 6, 16(6)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srad 3, 3, 4
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 4, 16(5)
-; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    srd 11, 7, 4
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    sld 7, 7, 9
+; BE-NEXT:    sld 9, 6, 9
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 8, 9, 8
+; BE-NEXT:    or 6, 7, 6
+; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: ashr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -80(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
 ; LE-32BIT-NEXT:    addi 6, 1, 48
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 8, 16(3)
+; LE-32BIT-NEXT:    lwz 9, 24(3)
+; LE-32BIT-NEXT:    lwz 10, 0(3)
+; LE-32BIT-NEXT:    lwz 11, 4(3)
+; LE-32BIT-NEXT:    lwz 12, 12(3)
+; LE-32BIT-NEXT:    lwz 0, 20(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
 ; LE-32BIT-NEXT:    stw 3, 76(1)
-; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
-; LE-32BIT-NEXT:    stw 0, 72(1)
-; LE-32BIT-NEXT:    stw 12, 68(1)
-; LE-32BIT-NEXT:    stw 11, 64(1)
-; LE-32BIT-NEXT:    stw 10, 60(1)
-; LE-32BIT-NEXT:    stw 9, 56(1)
-; LE-32BIT-NEXT:    stw 8, 52(1)
-; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    srawi 3, 10, 31
+; LE-32BIT-NEXT:    stw 7, 56(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 0, 68(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 12, 60(1)
+; LE-32BIT-NEXT:    stw 11, 52(1)
+; LE-32BIT-NEXT:    stw 9, 72(1)
+; LE-32BIT-NEXT:    stw 8, 64(1)
+; LE-32BIT-NEXT:    stw 10, 48(1)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    stw 3, 40(1)
 ; LE-32BIT-NEXT:    stw 3, 36(1)
-; LE-32BIT-NEXT:    stw 3, 32(1)
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 16(1)
-; LE-32BIT-NEXT:    sub 3, 6, 4
-; LE-32BIT-NEXT:    lwz 4, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    lwz 8, 8(3)
-; LE-32BIT-NEXT:    lwz 9, 20(3)
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    lwz 11, 24(3)
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    srw 28, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 0, 12
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    stw 10, 16(5)
-; LE-32BIT-NEXT:    stw 9, 20(5)
-; LE-32BIT-NEXT:    stw 8, 8(5)
-; LE-32BIT-NEXT:    stw 7, 12(5)
-; LE-32BIT-NEXT:    stw 6, 0(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
-; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    sraw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 98c76a7d3887c..f0fe0765ce19b 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,20 +209,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 4(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 6, 28(1)
-; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
 ; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 32
-; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    stw 9, 36(1)
 ; LE-32BIT-NEXT:    sub 3, 3, 6
-; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    stw 8, 40(1)
 ; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 7, 32(1)
 ; LE-32BIT-NEXT:    subfic 9, 4, 32
@@ -290,24 +290,24 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: shl_16bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    lwz 9, 0(3)
+; LE-32BIT-NEXT:    lwz 3, 8(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
-; LE-32BIT-NEXT:    stw 6, 44(1)
 ; LE-32BIT-NEXT:    stw 6, 40(1)
-; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
-; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 16
-; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 9, 16(1)
 ; LE-32BIT-NEXT:    clrlwi 4, 4, 27
-; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 8, 28(1)
 ; LE-32BIT-NEXT:    subfic 8, 4, 32
-; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    stw 7, 20(1)
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
 ; LE-32BIT-NEXT:    lwz 9, 4(6)
 ; LE-32BIT-NEXT:    slw 3, 3, 4
@@ -378,23 +378,23 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: ashr_16bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
 ; LE-32BIT-NEXT:    addi 6, 1, 32
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    lwz 9, 4(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    srawi 3, 8, 31
+; LE-32BIT-NEXT:    stw 7, 40(1)
 ; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 29
-; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    stw 9, 36(1)
 ; LE-32BIT-NEXT:    clrlwi 4, 4, 27
-; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    stw 8, 32(1)
 ; LE-32BIT-NEXT:    subfic 9, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 16(1)
 ; LE-32BIT-NEXT:    sub 3, 6, 7
 ; LE-32BIT-NEXT:    lwz 6, 4(3)
@@ -511,25 +511,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stwu 1, -112(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 16(3)
+; LE-32BIT-NEXT:    lwz 10, 24(3)
+; LE-32BIT-NEXT:    lwz 11, 4(3)
+; LE-32BIT-NEXT:    lwz 12, 12(3)
+; LE-32BIT-NEXT:    lwz 0, 20(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 6, 48(1)
 ; LE-32BIT-NEXT:    stw 6, 44(1)
-; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
-; LE-32BIT-NEXT:    stw 6, 32(1)
 ; LE-32BIT-NEXT:    stw 6, 28(1)
-; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
 ; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
-; LE-32BIT-NEXT:    stw 3, 80(1)
-; LE-32BIT-NEXT:    addi 3, 1, 52
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    sub 3, 3, 6
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
@@ -538,54 +538,52 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    stw 0, 76(1)
-; LE-32BIT-NEXT:    stw 12, 72(1)
-; LE-32BIT-NEXT:    xori 12, 4, 31
-; LE-32BIT-NEXT:    stw 11, 68(1)
-; LE-32BIT-NEXT:    stw 10, 64(1)
-; LE-32BIT-NEXT:    stw 9, 60(1)
+; LE-32BIT-NEXT:    stw 0, 68(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 12, 60(1)
+; LE-32BIT-NEXT:    stw 11, 52(1)
+; LE-32BIT-NEXT:    stw 10, 72(1)
+; LE-32BIT-NEXT:    stw 9, 64(1)
 ; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    stw 7, 52(1)
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    lwz 8, 0(3)
-; LE-32BIT-NEXT:    srw 29, 6, 4
-; LE-32BIT-NEXT:    lwz 9, 12(3)
-; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
 ; LE-32BIT-NEXT:    lwz 10, 20(3)
-; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    srw 28, 8, 4
 ; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    srw 27, 9, 4
-; LE-32BIT-NEXT:    lwz 0, 28(3)
-; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
 ; LE-32BIT-NEXT:    lwz 3, 24(3)
-; LE-32BIT-NEXT:    slw 25, 11, 30
-; LE-32BIT-NEXT:    slw 9, 9, 30
-; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
 ; LE-32BIT-NEXT:    srw 3, 3, 4
-; LE-32BIT-NEXT:    srw 0, 0, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
 ; LE-32BIT-NEXT:    or 3, 10, 3
 ; LE-32BIT-NEXT:    srw 11, 11, 4
 ; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 30, 0
+; LE-32BIT-NEXT:    or 3, 0, 12
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
 ; LE-32BIT-NEXT:    stw 3, 16(5)
 ; LE-32BIT-NEXT:    or 3, 25, 26
-; LE-32BIT-NEXT:    srw 8, 8, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 6, 27
-; LE-32BIT-NEXT:    slw 7, 7, 12
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    srw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 28, 4
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 29, 7
-; LE-32BIT-NEXT:    stw 8, 0(5)
-; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
@@ -684,15 +682,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: shl_32bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -112(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
-; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 28(3)
+; LE-32BIT-NEXT:    lwz 11, 0(3)
+; LE-32BIT-NEXT:    lwz 12, 8(3)
+; LE-32BIT-NEXT:    lwz 0, 16(3)
+; LE-32BIT-NEXT:    lwz 3, 24(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
@@ -700,25 +698,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 6, 80(1)
-; LE-32BIT-NEXT:    stw 6, 76(1)
 ; LE-32BIT-NEXT:    stw 6, 72(1)
-; LE-32BIT-NEXT:    stw 6, 68(1)
 ; LE-32BIT-NEXT:    stw 6, 64(1)
-; LE-32BIT-NEXT:    stw 6, 60(1)
 ; LE-32BIT-NEXT:    stw 6, 56(1)
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
 ; LE-32BIT-NEXT:    stw 6, 52(1)
 ; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
-; LE-32BIT-NEXT:    stw 3, 48(1)
-; LE-32BIT-NEXT:    addi 3, 1, 20
-; LE-32BIT-NEXT:    stw 0, 44(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 0, 32(1)
 ; LE-32BIT-NEXT:    clrlwi 4, 4, 27
-; LE-32BIT-NEXT:    stw 12, 40(1)
+; LE-32BIT-NEXT:    stw 12, 24(1)
 ; LE-32BIT-NEXT:    subfic 12, 4, 32
-; LE-32BIT-NEXT:    stw 11, 36(1)
-; LE-32BIT-NEXT:    stw 10, 32(1)
-; LE-32BIT-NEXT:    stw 9, 28(1)
-; LE-32BIT-NEXT:    stw 8, 24(1)
+; LE-32BIT-NEXT:    stw 11, 16(1)
+; LE-32BIT-NEXT:    stw 10, 44(1)
+; LE-32BIT-NEXT:    stw 9, 36(1)
+; LE-32BIT-NEXT:    stw 8, 28(1)
 ; LE-32BIT-NEXT:    stw 7, 20(1)
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
 ; LE-32BIT-NEXT:    lwz 7, 8(6)
@@ -858,19 +856,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-LABEL: ashr_32bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -112(1)
-; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    addi 6, 1, 52
-; LE-32BIT-NEXT:    lwz 8, 4(3)
-; LE-32BIT-NEXT:    lwz 9, 8(3)
-; LE-32BIT-NEXT:    lwz 10, 12(3)
-; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    lwz 12, 20(3)
-; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 7, 8(3)
+; LE-32BIT-NEXT:    addi 6, 1, 48
+; LE-32BIT-NEXT:    lwz 8, 16(3)
+; LE-32BIT-NEXT:    lwz 9, 24(3)
+; LE-32BIT-NEXT:    lwz 10, 0(3)
+; LE-32BIT-NEXT:    lwz 11, 4(3)
+; LE-32BIT-NEXT:    lwz 12, 12(3)
+; LE-32BIT-NEXT:    lwz 0, 20(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 3, 80(1)
-; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    stw 7, 52(1)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    srawi 3, 10, 31
+; LE-32BIT-NEXT:    stw 7, 56(1)
 ; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 29
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    clrlwi 4, 4, 27
@@ -879,62 +877,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    stw 0, 76(1)
-; LE-32BIT-NEXT:    stw 12, 72(1)
-; LE-32BIT-NEXT:    xori 12, 4, 31
-; LE-32BIT-NEXT:    stw 11, 68(1)
-; LE-32BIT-NEXT:    stw 10, 64(1)
-; LE-32BIT-NEXT:    stw 9, 60(1)
-; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    stw 3, 48(1)
+; LE-32BIT-NEXT:    stw 0, 68(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 12, 60(1)
+; LE-32BIT-NEXT:    stw 11, 52(1)
+; LE-32BIT-NEXT:    stw 9, 72(1)
+; LE-32BIT-NEXT:    stw 8, 64(1)
+; LE-32BIT-NEXT:    stw 10, 48(1)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    stw 3, 40(1)
 ; LE-32BIT-NEXT:    stw 3, 36(1)
-; LE-32BIT-NEXT:    stw 3, 32(1)
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
 ; LE-32BIT-NEXT:    sub 3, 6, 7
-; LE-32BIT-NEXT:    lwz 6, 8(3)
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    lwz 8, 0(3)
-; LE-32BIT-NEXT:    srw 29, 6, 4
-; LE-32BIT-NEXT:    lwz 9, 12(3)
-; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
 ; LE-32BIT-NEXT:    lwz 10, 20(3)
-; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    srw 28, 8, 4
 ; LE-32BIT-NEXT:    lwz 11, 16(3)
-; LE-32BIT-NEXT:    srw 27, 9, 4
-; LE-32BIT-NEXT:    lwz 0, 28(3)
-; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
 ; LE-32BIT-NEXT:    lwz 3, 24(3)
-; LE-32BIT-NEXT:    slw 25, 11, 30
-; LE-32BIT-NEXT:    slw 9, 9, 30
-; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
 ; LE-32BIT-NEXT:    srw 3, 3, 4
-; LE-32BIT-NEXT:    srw 0, 0, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
 ; LE-32BIT-NEXT:    or 3, 10, 3
 ; LE-32BIT-NEXT:    srw 11, 11, 4
 ; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 30, 0
+; LE-32BIT-NEXT:    or 3, 0, 12
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
 ; LE-32BIT-NEXT:    stw 3, 16(5)
 ; LE-32BIT-NEXT:    or 3, 25, 26
-; LE-32BIT-NEXT:    sraw 8, 8, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 6, 27
-; LE-32BIT-NEXT:    slw 7, 7, 12
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    sraw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 28, 4
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 29, 7
-; LE-32BIT-NEXT:    stw 8, 0(5)
-; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 5ba8755201ddf..19854afba772c 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -153,18 +153,18 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    lw a1, 0(a1)
 ; RV32I-NEXT:    sw zero, 24(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
 ; RV32I-NEXT:    srli a1, a2, 3
 ; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
@@ -222,19 +222,19 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw a3, 12(a1)
-; RV32I-NEXT:    lw a4, 8(a1)
-; RV32I-NEXT:    lw a5, 4(a1)
-; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    sw a3, 12(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 4(sp)
-; RV32I-NEXT:    sw a1, 0(sp)
-; RV32I-NEXT:    srai a3, a3, 31
-; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    sw a3, 8(sp)
+; RV32I-NEXT:    sw a4, 0(sp)
+; RV32I-NEXT:    srai a3, a5, 31
 ; RV32I-NEXT:    sw a3, 24(sp)
-; RV32I-NEXT:    sw a3, 20(sp)
 ; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a1, 4(sp)
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
 ; RV32I-NEXT:    srli a1, a2, 3
 ; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
@@ -293,16 +293,16 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    lw a2, 0(a2)
 ; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw a5, 28(sp)
+; RV32I-NEXT:    sw a1, 20(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw a1, 28(sp)
-; RV32I-NEXT:    sw a5, 24(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a4, 24(sp)
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    srli a1, a2, 3
 ; RV32I-NEXT:    andi a1, a1, 12
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 0b87bb05cfd63..9787d17362e9d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -734,20 +734,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -775,12 +775,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    andi a0, a1, 12
 ; RV32I-NEXT:    mv a3, sp
@@ -972,20 +972,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -1013,12 +1013,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a5, 24(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a4, 24(sp)
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    andi a0, a1, 12
 ; RV32I-NEXT:    addi a3, sp, 16
@@ -1210,20 +1210,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -1252,12 +1252,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
-; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    andi a0, a1, 12
 ; RV32I-NEXT:    mv a3, sp
@@ -1346,20 +1346,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1368,20 +1368,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -1435,12 +1435,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
-; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    andi a0, a1, 24
 ; RV64I-NEXT:    mv a3, sp
@@ -1544,60 +1544,60 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu t0, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or t0, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu t1, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -1624,24 +1624,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    sw zero, 64(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 36(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw a7, 4(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
 ; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a5, a3, a0
 ; RV32I-NEXT:    lw a3, 4(a5)
 ; RV32I-NEXT:    slli a6, a1, 3
@@ -1777,20 +1777,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1799,20 +1799,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -1866,12 +1866,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
-; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a5, 40(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a4, 48(sp)
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    andi a0, a1, 24
 ; RV64I-NEXT:    addi a3, sp, 32
@@ -1975,60 +1975,60 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu t0, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or t0, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu t1, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -2055,24 +2055,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    sw zero, 32(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
-; RV32I-NEXT:    sw a0, 64(sp)
-; RV32I-NEXT:    sw t1, 60(sp)
-; RV32I-NEXT:    sw t0, 56(sp)
-; RV32I-NEXT:    sw a7, 52(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    sw t0, 44(sp)
+; RV32I-NEXT:    sw a7, 36(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a6, 56(sp)
+; RV32I-NEXT:    sw a5, 48(sp)
 ; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a3, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
 ; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    addi a3, sp, 36
+; RV32I-NEXT:    addi a3, sp, 32
 ; RV32I-NEXT:    sub a6, a3, a0
 ; RV32I-NEXT:    lw a3, 4(a6)
 ; RV32I-NEXT:    slli a7, a1, 3
@@ -2208,20 +2208,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2230,20 +2230,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -2298,12 +2298,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    sraiw a0, a0, 31
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
-; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    andi a0, a1, 24
 ; RV64I-NEXT:    mv a3, sp
@@ -2407,60 +2407,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    lbu t0, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu t0, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or t0, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu t1, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli t1, t1, 8
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -2488,24 +2488,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t5
 ; RV32I-NEXT:    or a1, a1, t3
 ; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    sw a0, 64(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw a7, 4(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
 ; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    addi a3, sp, 4
+; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a5, a3, a0
 ; RV32I-NEXT:    lw a3, 4(a5)
 ; RV32I-NEXT:    slli a6, a1, 3
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 7e879b137b4f0..d85b9430c4fec 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -715,20 +715,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -756,12 +756,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a0, a0, 12
@@ -951,20 +951,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -992,12 +992,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a5, 24(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a4, 24(sp)
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a0, a0, 12
@@ -1187,20 +1187,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu t0, 7(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
@@ -1229,12 +1229,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
-; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
 ; RV32I-NEXT:    andi a0, a0, 12
@@ -1322,20 +1322,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1344,20 +1344,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -1411,12 +1411,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
-; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a0, a0, 24
@@ -1517,60 +1517,60 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or t0, t0, a7
-; RV32I-NEXT:    lbu a7, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t1, t1, a7
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -1598,20 +1598,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    sw zero, 48(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 36(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw t2, 20(sp)
+; RV32I-NEXT:    sw t1, 12(sp)
+; RV32I-NEXT:    sw t0, 4(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    srli a0, a7, 3
 ; RV32I-NEXT:    andi a0, a0, 28
@@ -1746,20 +1746,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1768,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -1835,12 +1835,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
-; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a5, 40(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a4, 48(sp)
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a0, a0, 24
@@ -1941,60 +1941,60 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or t0, t0, a7
-; RV32I-NEXT:    lbu a7, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t1, t1, a7
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -2022,20 +2022,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw t2, 56(sp)
-; RV32I-NEXT:    sw t1, 52(sp)
-; RV32I-NEXT:    sw t0, 48(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw t2, 52(sp)
+; RV32I-NEXT:    sw t1, 44(sp)
+; RV32I-NEXT:    sw t0, 36(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a6, 56(sp)
+; RV32I-NEXT:    sw a5, 48(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
 ; RV32I-NEXT:    sw a3, 32(sp)
 ; RV32I-NEXT:    srli a0, a7, 3
 ; RV32I-NEXT:    andi a0, a0, 28
@@ -2170,20 +2170,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 9(a0)
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu a5, 16(a0)
+; RV64I-NEXT:    lbu a6, 18(a0)
+; RV64I-NEXT:    lbu a7, 19(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 13(a0)
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a5, 21(a0)
+; RV64I-NEXT:    lbu a6, 20(a0)
+; RV64I-NEXT:    lbu a7, 22(a0)
+; RV64I-NEXT:    lbu t0, 23(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2192,20 +2192,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 17(a0)
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu t1, 15(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -2260,12 +2260,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    sraiw a0, a0, 31
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
-; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a4, 16(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    srli a0, a1, 3
 ; RV64I-NEXT:    andi a0, a0, 24
@@ -2366,60 +2366,60 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a7, 11(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a5, a7, a6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    lbu a5, 9(a0)
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    lbu a5, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu a7, 18(a0)
+; RV32I-NEXT:    lbu t0, 19(a0)
 ; RV32I-NEXT:    slli a5, a5, 8
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 13(a0)
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    lbu a6, 25(a0)
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t0, 26(a0)
+; RV32I-NEXT:    lbu t1, 27(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
 ; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    lbu a7, 17(a0)
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    lbu a7, 5(a0)
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or t0, t0, a7
-; RV32I-NEXT:    lbu a7, 21(a0)
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu t3, 15(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
 ; RV32I-NEXT:    or t1, t3, t2
 ; RV32I-NEXT:    or t1, t1, a7
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t2, 20(a0)
+; RV32I-NEXT:    lbu t3, 22(a0)
+; RV32I-NEXT:    lbu t4, 23(a0)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
@@ -2448,20 +2448,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a7, a1, a7
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw a0, 48(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
 ; RV32I-NEXT:    sw t3, 28(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw t2, 20(sp)
+; RV32I-NEXT:    sw t1, 12(sp)
+; RV32I-NEXT:    sw t0, 4(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    srli a0, a7, 3
 ; RV32I-NEXT:    andi a0, a0, 28
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 54106bde42527..33654bb250b1e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -174,93 +174,96 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
 ; X86:       # %bb.0: # %_udiv-special-cases
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $152, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $176, %esp
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 16(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    xorl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bsrl %edi, %edi
 ; X86-NEXT:    xorl $31, %edi
 ; X86-NEXT:    addl $32, %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %edi
 ; X86-NEXT:    addl $64, %edi
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    bsrl %ebp, %ecx
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    bsrl %ebx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -269,133 +272,131 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %edi, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %ecx
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    jne .LBB4_1
-; X86-NEXT:  # %bb.8: # %_udiv-special-cases
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_8
+; X86-NEXT:  # %bb.1: # %_udiv-special-cases
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    je .LBB4_9
-; X86-NEXT:  # %bb.5: # %udiv-bb1
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    je .LBB4_8
+; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 144(%esp,%eax), %edx
-; X86-NEXT:    movl 148(%esp,%eax), %esi
+; X86-NEXT:    movl 152(%esp,%eax), %edx
+; X86-NEXT:    movl 156(%esp,%eax), %esi
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 136(%esp,%eax), %esi
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edi, %edx
-; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl 148(%esp,%eax), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edx
+; X86-NEXT:    movl 144(%esp,%eax), %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shldl %cl, %esi, %eax
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    jae .LBB4_2
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    jmp .LBB4_9
-; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jmp .LBB4_7
+; X86-NEXT:  .LBB4_3: # %udiv-preheader
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 108(%esp,%eax), %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl 100(%esp,%edx), %ebx
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl 104(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 100(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl 96(%esp,%eax), %eax
 ; X86-NEXT:    shrdl %cl, %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 88(%esp,%edx), %eax
-; X86-NEXT:    movl 92(%esp,%edx), %edx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    shrl %cl, %ebx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -410,174 +411,177 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB4_3: # %udiv-do-while
+; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shldl $1, %eax, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %ebx
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $1, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB4_4
+; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %eax
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:  .LBB4_9: # %udiv-end
-; X86-NEXT:    xorl %ebp, %eax
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    subl %ebp, %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, (%edi)
-; X86-NEXT:    movl %ecx, 4(%edi)
-; X86-NEXT:    movl %edx, 8(%edi)
-; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:  .LBB4_8: # %udiv-end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %ebx, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl %esi, 12(%ecx)
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $152, %esp
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 84f35c6485abe..d4f62d1aa7c1c 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -174,68 +174,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
 ; X86:       # %bb.0: # %_udiv-special-cases
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl 36(%ebp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sete %bl
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    orl 24(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %bl, %al
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    bsrl 36(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    addl $32, %ebp
+; X86-NEXT:    bsrl 28(%ebp), %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    addl $64, %ebp
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    cmovnel %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    addl $64, %eax
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    bsrl %esi, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bsrl 12(%ebp), %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl 20(%ebp), %edi
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    subl %edx, %ebp
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %ebx
@@ -243,7 +244,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
@@ -251,9 +252,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    orb (%esp), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %eax
@@ -263,35 +263,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    cmovnel %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    cmovnel %edi, %edx
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movb %cl, %ah
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    cmovnel %ecx, %ebx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    cmovnel %ecx, %edi
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    cmovnel %esi, %ecx
+; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
@@ -300,144 +296,142 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 124(%esp,%eax), %ebp
-; X86-NEXT:    movl 128(%esp,%eax), %edx
-; X86-NEXT:    shldl %cl, %ebp, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 116(%esp,%eax), %edx
-; X86-NEXT:    movl 120(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    shldl %cl, %edx, %ebp
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 136(%esp,%eax), %edx
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 132(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl 128(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %edi
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    jmp .LBB4_6
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 80(%esp,%eax), %esi
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %ebp
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 68(%esp,%eax), %ecx
+; X86-NEXT:    movzbl %al, %esi
+; X86-NEXT:    movl 92(%esp,%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %eax
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 84(%esp,%esi), %edx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%esi), %eax
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    shldl $1, %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %edi, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl 40(%ebp), %edi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl 36(%ebp), %esi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl 32(%ebp), %edx
+; X86-NEXT:    andl 28(%ebp), %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
@@ -445,96 +439,95 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %ebp
-; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:  .LBB4_7: # %udiv-end
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, (%esi)
-; X86-NEXT:    movl %ebp, 4(%esi)
-; X86-NEXT:    movl %ebx, 8(%esi)
-; X86-NEXT:    movl %edx, 12(%esi)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 28(%ebp), %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 3dbd0213293bb..17ad1ef67ef79 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,49 +22,49 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl $176, %esp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %ecx
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    divb %cl
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shll $30, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $30, %ecx
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edi
 ; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    shrdl $1, %eax, %ecx
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    subl %ecx, %edi
+; X86-NEXT:    sbbl %eax, %esi
 ; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    andl $3, %edx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    bsrl %edi, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    jmp .LBB0_3
 ; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    bsrl %esi, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB0_3: # %BB_udiv-special-cases
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_4
 ; X86-NEXT:  # %bb.5: # %BB_udiv-special-cases
 ; X86-NEXT:    addl $64, %eax
 ; X86-NEXT:    jmp .LBB0_6
 ; X86-NEXT:  .LBB0_4:
-; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    bsrl %edx, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:  .LBB0_6: # %BB_udiv-special-cases
@@ -104,32 +104,28 @@ define void @f() nounwind {
 ; X86-NEXT:    movsbl %al, %esi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 128(%esp,%esi), %edi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 132(%esp,%esi), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shldl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 112(%esp,%esi), %eax
-; X86-NEXT:    movl 116(%esp,%esi), %edx
-; X86-NEXT:    movl 120(%esp,%esi), %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl 136(%esp,%esi), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB0_11
 ; X86-NEXT:  # %bb.9: # %udiv-preheader
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -139,14 +135,17 @@ define void @f() nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NEXT:    movl 68(%esp,%eax), %eax
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl 88(%esp,%eax), %edx
+; X86-NEXT:    movl 84(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 80(%esp,%eax), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
@@ -163,29 +162,30 @@ define void @f() nounwind {
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB0_10: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    andl $2, %edx
 ; X86-NEXT:    shrl %edx
-; X86-NEXT:    leal (%edx,%ebx,2), %ebx
+; X86-NEXT:    leal (%edx,%esi,2), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    shldl $1, %edx, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    shll $30, %edx
@@ -200,10 +200,10 @@ define void @f() nounwind {
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    subl %edi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    subl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index b2ff06798aad7..37dab8a80e0b4 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -13,53 +13,48 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-LABEL: test1:
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorps %xmm0, %xmm0
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    leal (%rsi,%rsi), %ecx
 ; ILP-NEXT:    addb $3, %cl
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movl %ecx, %edx
 ; ILP-NEXT:    shrb $3, %dl
+; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    andb $24, %dl
 ; ILP-NEXT:    negb %dl
 ; ILP-NEXT:    movsbq %dl, %rdx
-; ILP-NEXT:    movq -16(%rsp,%rdx), %rsi
-; ILP-NEXT:    movq -8(%rsp,%rdx), %rdi
-; ILP-NEXT:    shldq %cl, %rsi, %rdi
+; ILP-NEXT:    movq -40(%rsp,%rdx), %rsi
+; ILP-NEXT:    movq -24(%rsp,%rdx), %rdi
 ; ILP-NEXT:    movq -32(%rsp,%rdx), %r8
-; ILP-NEXT:    movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT:    movq -16(%rsp,%rdx), %rdx
 ; ILP-NEXT:    movq %r8, %r9
-; ILP-NEXT:    shlq %cl, %r9
-; ILP-NEXT:    movq %rdx, %r10
-; ILP-NEXT:    shldq %cl, %r8, %r10
-; ILP-NEXT:    movq %rdi, 24(%rax)
-; ILP-NEXT:    movq %r10, 8(%rax)
-; ILP-NEXT:    movq %r9, (%rax)
+; ILP-NEXT:    shldq %cl, %rsi, %r9
+; ILP-NEXT:    shldq %cl, %rdi, %rdx
 ; ILP-NEXT:    shlq %cl, %rsi
+; ILP-NEXT:    movq %rdx, 24(%rax)
+; ILP-NEXT:    movq %r9, 8(%rax)
+; ILP-NEXT:    movq %rsi, (%rax)
+; ILP-NEXT:    shlq %cl, %rdi
 ; ILP-NEXT:    notb %cl
-; ILP-NEXT:    shrq %rdx
+; ILP-NEXT:    shrq %r8
 ; ILP-NEXT:    # kill: def $cl killed $cl killed $ecx
-; ILP-NEXT:    shrq %cl, %rdx
-; ILP-NEXT:    orq %rsi, %rdx
-; ILP-NEXT:    movq %rdx, 16(%rax)
+; ILP-NEXT:    shrq %cl, %r8
+; ILP-NEXT:    orq %rdi, %r8
+; ILP-NEXT:    movq %r8, 16(%rax)
 ; ILP-NEXT:    retq
 ;
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    movq %rdi, %rax
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    xorps %xmm0, %xmm0
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    leal (%rsi,%rsi), %ecx
 ; HYBRID-NEXT:    addb $3, %cl
 ; HYBRID-NEXT:    movl %ecx, %edx
@@ -67,37 +62,35 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    andb $24, %dl
 ; HYBRID-NEXT:    negb %dl
 ; HYBRID-NEXT:    movsbq %dl, %rdx
-; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rsi
-; HYBRID-NEXT:    movq -8(%rsp,%rdx), %rdi
-; HYBRID-NEXT:    shldq %cl, %rsi, %rdi
-; HYBRID-NEXT:    movq %rdi, 24(%rax)
-; HYBRID-NEXT:    movq -32(%rsp,%rdx), %rdi
-; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rdx
-; HYBRID-NEXT:    movq %rdx, %r8
-; HYBRID-NEXT:    shldq %cl, %rdi, %r8
-; HYBRID-NEXT:    movq %r8, 8(%rax)
-; HYBRID-NEXT:    shlq %cl, %rdi
-; HYBRID-NEXT:    movq %rdi, (%rax)
+; HYBRID-NEXT:    movq -40(%rsp,%rdx), %rsi
+; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rdi
+; HYBRID-NEXT:    movq -32(%rsp,%rdx), %r8
+; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rdx
+; HYBRID-NEXT:    shldq %cl, %rdi, %rdx
+; HYBRID-NEXT:    movq %rdx, 24(%rax)
+; HYBRID-NEXT:    movq %r8, %rdx
+; HYBRID-NEXT:    shldq %cl, %rsi, %rdx
+; HYBRID-NEXT:    movq %rdx, 8(%rax)
 ; HYBRID-NEXT:    shlq %cl, %rsi
+; HYBRID-NEXT:    movq %rsi, (%rax)
+; HYBRID-NEXT:    shlq %cl, %rdi
 ; HYBRID-NEXT:    notb %cl
-; HYBRID-NEXT:    shrq %rdx
+; HYBRID-NEXT:    shrq %r8
 ; HYBRID-NEXT:    # kill: def $cl killed $cl killed $ecx
-; HYBRID-NEXT:    shrq %cl, %rdx
-; HYBRID-NEXT:    orq %rsi, %rdx
-; HYBRID-NEXT:    movq %rdx, 16(%rax)
+; HYBRID-NEXT:    shrq %cl, %r8
+; HYBRID-NEXT:    orq %rdi, %r8
+; HYBRID-NEXT:    movq %r8, 16(%rax)
 ; HYBRID-NEXT:    retq
 ;
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    movq %rdi, %rax
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    xorps %xmm0, %xmm0
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    leal (%rsi,%rsi), %ecx
 ; BURR-NEXT:    addb $3, %cl
 ; BURR-NEXT:    movl %ecx, %edx
@@ -105,24 +98,24 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    andb $24, %dl
 ; BURR-NEXT:    negb %dl
 ; BURR-NEXT:    movsbq %dl, %rdx
-; BURR-NEXT:    movq -16(%rsp,%rdx), %rsi
-; BURR-NEXT:    movq -8(%rsp,%rdx), %rdi
-; BURR-NEXT:    shldq %cl, %rsi, %rdi
-; BURR-NEXT:    movq %rdi, 24(%rax)
-; BURR-NEXT:    movq -32(%rsp,%rdx), %rdi
-; BURR-NEXT:    movq -24(%rsp,%rdx), %rdx
-; BURR-NEXT:    movq %rdx, %r8
-; BURR-NEXT:    shldq %cl, %rdi, %r8
-; BURR-NEXT:    movq %r8, 8(%rax)
-; BURR-NEXT:    shlq %cl, %rdi
-; BURR-NEXT:    movq %rdi, (%rax)
+; BURR-NEXT:    movq -40(%rsp,%rdx), %rsi
+; BURR-NEXT:    movq -24(%rsp,%rdx), %rdi
+; BURR-NEXT:    movq -32(%rsp,%rdx), %r8
+; BURR-NEXT:    movq -16(%rsp,%rdx), %rdx
+; BURR-NEXT:    shldq %cl, %rdi, %rdx
+; BURR-NEXT:    movq %rdx, 24(%rax)
+; BURR-NEXT:    movq %r8, %rdx
+; BURR-NEXT:    shldq %cl, %rsi, %rdx
+; BURR-NEXT:    movq %rdx, 8(%rax)
 ; BURR-NEXT:    shlq %cl, %rsi
+; BURR-NEXT:    movq %rsi, (%rax)
+; BURR-NEXT:    shlq %cl, %rdi
 ; BURR-NEXT:    notb %cl
-; BURR-NEXT:    shrq %rdx
+; BURR-NEXT:    shrq %r8
 ; BURR-NEXT:    # kill: def $cl killed $cl killed $ecx
-; BURR-NEXT:    shrq %cl, %rdx
-; BURR-NEXT:    orq %rsi, %rdx
-; BURR-NEXT:    movq %rdx, 16(%rax)
+; BURR-NEXT:    shrq %cl, %r8
+; BURR-NEXT:    orq %rdi, %r8
+; BURR-NEXT:    movq %r8, 16(%rax)
 ; BURR-NEXT:    retq
 ;
 ; SRC-LABEL: test1:
@@ -130,36 +123,33 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    movq %rdi, %rax
 ; SRC-NEXT:    leal (%rsi,%rsi), %edx
 ; SRC-NEXT:    addb $3, %dl
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    xorps %xmm0, %xmm0
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shrb $3, %cl
 ; SRC-NEXT:    andb $24, %cl
 ; SRC-NEXT:    negb %cl
 ; SRC-NEXT:    movsbq %cl, %rsi
-; SRC-NEXT:    movq -16(%rsp,%rsi), %rdi
-; SRC-NEXT:    movq %rdi, %r8
+; SRC-NEXT:    movq -40(%rsp,%rsi), %rdi
+; SRC-NEXT:    movq -24(%rsp,%rsi), %r8
+; SRC-NEXT:    movq %r8, %r9
 ; SRC-NEXT:    movl %edx, %ecx
-; SRC-NEXT:    shlq %cl, %r8
+; SRC-NEXT:    shlq %cl, %r9
+; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    notb %cl
-; SRC-NEXT:    movq -32(%rsp,%rsi), %r9
-; SRC-NEXT:    movq -24(%rsp,%rsi), %r10
+; SRC-NEXT:    movq -32(%rsp,%rsi), %r10
+; SRC-NEXT:    movq -16(%rsp,%rsi), %rsi
 ; SRC-NEXT:    movq %r10, %r11
 ; SRC-NEXT:    shrq %r11
 ; SRC-NEXT:    shrq %cl, %r11
-; SRC-NEXT:    orq %r8, %r11
-; SRC-NEXT:    movq -8(%rsp,%rsi), %rsi
+; SRC-NEXT:    orq %r9, %r11
 ; SRC-NEXT:    movl %edx, %ecx
-; SRC-NEXT:    shldq %cl, %rdi, %rsi
-; SRC-NEXT:    movq %r9, %rdi
+; SRC-NEXT:    shldq %cl, %rdi, %r10
+; SRC-NEXT:    shldq %cl, %r8, %rsi
 ; SRC-NEXT:    shlq %cl, %rdi
-; SRC-NEXT:    shldq %cl, %r9, %r10
 ; SRC-NEXT:    movq %rsi, 24(%rax)
 ; SRC-NEXT:    movq %r10, 8(%rax)
 ; SRC-NEXT:    movq %rdi, (%rax)
@@ -176,24 +166,22 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    andb $24, %cl
 ; LIN-NEXT:    negb %cl
 ; LIN-NEXT:    movsbq %cl, %rsi
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    xorps %xmm0, %xmm0
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; LIN-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq -32(%rsp,%rsi), %rdi
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq -40(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r8
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shlq %cl, %r8
 ; LIN-NEXT:    movq %r8, (%rax)
-; LIN-NEXT:    movq -24(%rsp,%rsi), %r8
+; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq -32(%rsp,%rsi), %r8
 ; LIN-NEXT:    movq %r8, %r9
 ; LIN-NEXT:    shldq %cl, %rdi, %r9
 ; LIN-NEXT:    movq %r9, 8(%rax)
-; LIN-NEXT:    movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT:    movq -24(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r9
 ; LIN-NEXT:    shlq %cl, %r9
 ; LIN-NEXT:    shrq %r8
@@ -201,7 +189,7 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    shrq %cl, %r8
 ; LIN-NEXT:    orq %r9, %r8
 ; LIN-NEXT:    movq %r8, 16(%rax)
-; LIN-NEXT:    movq -8(%rsp,%rsi), %rsi
+; LIN-NEXT:    movq -16(%rsp,%rsi), %rsi
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shldq %cl, %rdi, %rsi
 ; LIN-NEXT:    movq %rsi, 24(%rax)
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index ed1ba5c59e500..e6d716afdbe8b 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -10,43 +10,44 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_lshr_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl 12(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, (%esp)
+; i686-NEXT:    movl %esi, (%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    movzbl %al, %edi
+; i686-NEXT:    movl 4(%esp,%edi), %edx
+; i686-NEXT:    movl (%esp,%edi), %eax
+; i686-NEXT:    shrdl %cl, %edx, %eax
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%esp,%edi), %esi
+; i686-NEXT:    shrdl %cl, %esi, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, %edx
-; i686-NEXT:    shrb $3, %dl
-; i686-NEXT:    andb $12, %dl
-; i686-NEXT:    movzbl %dl, %ebx
-; i686-NEXT:    movl 8(%esp,%ebx), %esi
-; i686-NEXT:    movl (%esp,%ebx), %edx
-; i686-NEXT:    movl 4(%esp,%ebx), %ebp
-; i686-NEXT:    movl %ebp, %edi
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl 12(%esp,%ebx), %ebx
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    shrdl %cl, %ebp, %edx
+; i686-NEXT:    movl 12(%esp,%edi), %edi
+; i686-NEXT:    shrdl %cl, %edi, %esi
+; i686-NEXT:    movl 40(%ebp), %ebx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, %ebx
-; i686-NEXT:    movl %ebx, 12(%eax)
-; i686-NEXT:    movl %esi, 8(%eax)
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    movl %edx, (%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    shrl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%ebx)
+; i686-NEXT:    movl %esi, 8(%ebx)
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %eax, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -75,44 +76,45 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_ashr_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %esi
+; i686-NEXT:    movl 8(%ebp), %edx
+; i686-NEXT:    movl 12(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, (%esp)
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, %edx
-; i686-NEXT:    shrb $3, %dl
-; i686-NEXT:    andb $12, %dl
-; i686-NEXT:    movzbl %dl, %ebx
-; i686-NEXT:    movl 8(%esp,%ebx), %esi
-; i686-NEXT:    movl (%esp,%ebx), %edx
-; i686-NEXT:    movl 4(%esp,%ebx), %ebp
-; i686-NEXT:    movl %ebp, %edi
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl 12(%esp,%ebx), %ebx
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    shrdl %cl, %ebp, %edx
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    movzbl %al, %edi
+; i686-NEXT:    movl 4(%esp,%edi), %edx
+; i686-NEXT:    movl (%esp,%edi), %eax
+; i686-NEXT:    shrdl %cl, %edx, %eax
+; i686-NEXT:    movl 8(%esp,%edi), %esi
+; i686-NEXT:    shrdl %cl, %esi, %edx
+; i686-NEXT:    movl 12(%esp,%edi), %edi
+; i686-NEXT:    shrdl %cl, %edi, %esi
+; i686-NEXT:    movl 40(%ebp), %ebx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    sarl %cl, %ebx
-; i686-NEXT:    movl %ebx, 12(%eax)
-; i686-NEXT:    movl %esi, 8(%eax)
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    movl %edx, (%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%ebx)
+; i686-NEXT:    movl %esi, 8(%ebx)
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %eax, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -142,45 +144,49 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_shl_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $64, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl 12(%ebp), %edi
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    negb %al
+; i686-NEXT:    movsbl %al, %esi
+; i686-NEXT:    movl 32(%esp,%esi), %eax
+; i686-NEXT:    movl 36(%esp,%esi), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shldl %cl, %eax, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl $0, (%esp)
-; i686-NEXT:    movl %ecx, %edx
-; i686-NEXT:    shrb $3, %dl
-; i686-NEXT:    andb $12, %dl
-; i686-NEXT:    negb %dl
-; i686-NEXT:    movsbl %dl, %edi
-; i686-NEXT:    movl 16(%esp,%edi), %edx
-; i686-NEXT:    movl 20(%esp,%edi), %esi
-; i686-NEXT:    movl 24(%esp,%edi), %ebx
-; i686-NEXT:    movl %ebx, %ebp
-; i686-NEXT:    shldl %cl, %esi, %ebp
-; i686-NEXT:    movl 28(%esp,%edi), %edi
+; i686-NEXT:    movl 40(%esp,%esi), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %ebx, %edi
-; i686-NEXT:    movl %edi, 12(%eax)
-; i686-NEXT:    movl %ebp, 8(%eax)
-; i686-NEXT:    movl %edx, %edi
-; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 44(%esp,%esi), %esi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shldl %cl, %ebx, %esi
+; i686-NEXT:    movl 40(%ebp), %ebx
+; i686-NEXT:    movl %esi, 12(%ebx)
+; i686-NEXT:    movl %edi, 8(%ebx)
+; i686-NEXT:    movl %edx, 4(%ebx)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edx, %esi
-; i686-NEXT:    movl %esi, 4(%eax)
-; i686-NEXT:    movl %edi, (%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    movl %eax, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -243,88 +249,87 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-LABEL: test_lshr_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
+; i686-NEXT:    movl 24(%ebp), %esi
+; i686-NEXT:    movl 12(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 20(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 36(%esp,%edx), %esi
+; i686-NEXT:    movl 32(%esp,%edx), %eax
+; i686-NEXT:    shrdl %cl, %esi, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 40(%esp,%edx), %eax
+; i686-NEXT:    shrdl %cl, %eax, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 44(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ecx, %edi
+; i686-NEXT:    shrdl %cl, %edx, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 56(%ebp), %edx
 ; i686-NEXT:    movl %edx, %eax
 ; i686-NEXT:    andl $31, %eax
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 36(%esp,%edx), %edi
-; i686-NEXT:    movl 28(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 68(%esp,%edx), %ebx
+; i686-NEXT:    movl 64(%esp,%edx), %esi
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edi, %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 40(%esp,%edx), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    shrdl %cl, %edx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 72(%esp,%edx), %esi
+; i686-NEXT:    shrdl %cl, %esi, %ebx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, %edx
-; i686-NEXT:    andl $31, %edx
-; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $12, %esi
-; i686-NEXT:    movl 68(%esp,%esi), %ebp
-; i686-NEXT:    movl 64(%esp,%esi), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 60(%esp,%esi), %edi
-; i686-NEXT:    movl 72(%esp,%esi), %esi
-; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl 76(%esp,%edx), %edx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT:    shrl %cl, %edi
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, %edi
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %esi, 28(%eax)
-; i686-NEXT:    movl %ebp, 24(%eax)
+; i686-NEXT:    shrl %cl, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    movl %ebx, 20(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %edi, 16(%eax)
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl %ecx, 16(%eax)
+; i686-NEXT:    movl %edi, 12(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 8(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 4(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    addl $92, %esp
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -365,82 +370,80 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-LABEL: test_ashr_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %esi
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 12(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 20(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %eax
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 36(%esp,%edx), %esi
+; i686-NEXT:    movl 32(%esp,%edx), %eax
+; i686-NEXT:    shrdl %cl, %esi, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 40(%esp,%edx), %eax
+; i686-NEXT:    shrdl %cl, %eax, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 44(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrdl %cl, %edx, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 56(%ebp), %edx
 ; i686-NEXT:    movl %edx, %eax
 ; i686-NEXT:    andl $31, %eax
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 36(%esp,%edx), %edi
-; i686-NEXT:    movl 28(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 68(%esp,%edx), %ebx
+; i686-NEXT:    movl 64(%esp,%edx), %edi
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edi, %ebx
-; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 40(%esp,%edx), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    shrdl %cl, %edx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, %edx
-; i686-NEXT:    andl $31, %edx
-; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $12, %esi
-; i686-NEXT:    movl 68(%esp,%esi), %ebp
-; i686-NEXT:    movl 64(%esp,%esi), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 60(%esp,%esi), %edi
-; i686-NEXT:    movl 72(%esp,%esi), %esi
-; i686-NEXT:    shrdl %cl, %esi, %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    sarl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, %edi
-; i686-NEXT:    sarl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %esi, 28(%eax)
-; i686-NEXT:    movl %ebp, 24(%eax)
+; i686-NEXT:    shrdl %cl, %ebx, %edi
+; i686-NEXT:    movl 72(%esp,%edx), %esi
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl 76(%esp,%edx), %edx
+; i686-NEXT:    shrdl %cl, %edx, %esi
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    sarl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    sarl %cl, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    movl %ebx, 20(%eax)
 ; i686-NEXT:    movl %edi, 16(%eax)
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 12(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 8(%eax)
@@ -448,7 +451,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %ecx, 4(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    addl $92, %esp
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -492,94 +495,98 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-LABEL: test_shl_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $128, %esp
+; i686-NEXT:    movl 40(%ebp), %edi
+; i686-NEXT:    movl 32(%ebp), %ecx
+; i686-NEXT:    movl 36(%ebp), %eax
+; i686-NEXT:    movl 24(%ebp), %edx
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %esi
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 28(%ebp), %esi
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, %ebx
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $12, %ebx
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    subl %ebx, %edx
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    movl %edi, %edx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    subl %edx, %eax
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%edx), %esi
+; i686-NEXT:    movl (%eax), %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 4(%edx), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 4(%eax), %edx
 ; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl %edx, %edi
 ; i686-NEXT:    shldl %cl, %esi, %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ebp
-; i686-NEXT:    shrl $3, %ebp
-; i686-NEXT:    andl $12, %ebp
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    subl %ebp, %ecx
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%eax), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %edx, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 56(%ebp), %eax
+; i686-NEXT:    movl %eax, %edx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    subl %edx, %esi
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl (%esi), %edi
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%ecx), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 4(%ecx), %edi
-; i686-NEXT:    movl 8(%ecx), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    andl $31, %eax
-; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT:    movl 4(%esi), %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shldl %cl, %edi, %esi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%esi), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shldl %cl, %ebx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    negl %ebx
-; i686-NEXT:    movl 64(%esp,%ebx), %ebx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shldl %cl, %eax, %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, %edx
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %eax, %edi
-; i686-NEXT:    negl %ebp
-; i686-NEXT:    movl 96(%esp,%ebp), %ebp
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    negl %ecx
+; i686-NEXT:    movl 76(%esp,%ecx), %ebx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    shldl %cl, %esi, %ebx
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    negl %edx
+; i686-NEXT:    movl 108(%esp,%edx), %edx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shldl %cl, %eax, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebp, 28(%eax)
-; i686-NEXT:    movl %esi, 24(%eax)
-; i686-NEXT:    movl %edi, 20(%eax)
-; i686-NEXT:    movl %edx, 16(%eax)
+; i686-NEXT:    shldl %cl, %eax, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %edi, 16(%eax)
 ; i686-NEXT:    movl %ebx, 12(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 8(%eax)
@@ -587,7 +594,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl %ecx, 4(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    addl $100, %esp
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index bf159acc43f91..f015dfddcfcaf 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -8,74 +8,74 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-LABEL: shift1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $80, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $96, %esp
+; CHECK-NEXT:    movl 40(%ebp), %ecx
+; CHECK-NEXT:    movl 16(%ebp), %edx
+; CHECK-NEXT:    movl 20(%ebp), %eax
+; CHECK-NEXT:    movl 32(%ebp), %esi
+; CHECK-NEXT:    movl 28(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 12(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 24(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 8(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 36(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    sarl $31, %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sarl $31, %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrb $5, %al
-; CHECK-NEXT:    movzbl %al, %ebp
-; CHECK-NEXT:    movl 24(%esp,%ebp,4), %eax
-; CHECK-NEXT:    movl 20(%esp,%ebp,4), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrdl %cl, %eax, %edx
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl 20(%esp,%eax,4), %edx
+; CHECK-NEXT:    movl 16(%esp,%eax,4), %esi
+; CHECK-NEXT:    shrdl %cl, %edx, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 24(%esp,%eax,4), %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrdl %cl, %esi, %edx
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 28(%esp,%ebp,4), %edx
-; CHECK-NEXT:    shrdl %cl, %edx, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; CHECK-NEXT:    shrdl %cl, %ebx, %edx
-; CHECK-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; CHECK-NEXT:    movl 36(%esp,%ebp,4), %edx
+; CHECK-NEXT:    movl 36(%esp,%eax,4), %ebx
+; CHECK-NEXT:    movl 32(%esp,%eax,4), %edi
+; CHECK-NEXT:    movl 40(%esp,%eax,4), %edx
+; CHECK-NEXT:    movl 28(%esp,%eax,4), %esi
+; CHECK-NEXT:    movl 44(%esp,%eax,4), %eax
+; CHECK-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    shrdl %cl, %edi, %esi
+; CHECK-NEXT:    shrdl %cl, %ebx, %edi
 ; CHECK-NEXT:    shrdl %cl, %edx, %ebx
-; CHECK-NEXT:    movl 40(%esp,%ebp,4), %eax
 ; CHECK-NEXT:    shrdl %cl, %eax, %edx
-; CHECK-NEXT:    movl 16(%esp,%ebp,4), %esi
-; CHECK-NEXT:    movl 44(%esp,%ebp,4), %ebp
-; CHECK-NEXT:    shrdl %cl, %ebp, %eax
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %edi, %esi
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    sarl %cl, %ebp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ebp, 28(%ecx)
-; CHECK-NEXT:    movl %eax, 24(%ecx)
-; CHECK-NEXT:    movl %edx, 20(%ecx)
-; CHECK-NEXT:    movl %ebx, 16(%ecx)
-; CHECK-NEXT:    movl (%esp), %eax # 4-byte Reload
-; CHECK-NEXT:    movl %eax, 12(%ecx)
+; CHECK-NEXT:    sarl %cl, %eax
+; CHECK-NEXT:    movl 72(%ebp), %ecx
+; CHECK-NEXT:    movl %eax, 28(%ecx)
+; CHECK-NEXT:    movl %edx, 24(%ecx)
+; CHECK-NEXT:    movl %ebx, 20(%ecx)
+; CHECK-NEXT:    movl %edi, 16(%ecx)
+; CHECK-NEXT:    movl %esi, 12(%ecx)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; CHECK-NEXT:    movl %eax, 8(%ecx)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; CHECK-NEXT:    movl %eax, 4(%ecx)
-; CHECK-NEXT:    movl %esi, (%ecx)
-; CHECK-NEXT:    addl $80, %esp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, (%ecx)
+; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -102,13 +102,13 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O0-NEXT:    shrb $6, %dl
 ; CHECK-X64-O0-NEXT:    movzbl %dl, %edx
 ; CHECK-X64-O0-NEXT:    movl %edx, %edi
-; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi,8), %rsi
-; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi,8), %r8
-; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi,8), %r9
 ; CHECK-X64-O0-NEXT:    movq %r9, %rdx
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rsi, %rdx
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq -40(%rsp,%rdi,8), %rdi
+; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi,8), %rdi
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rdi, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %r9, %r8
@@ -126,30 +126,29 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O2:       # %bb.0: # %entry
 ; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    sarq $63, %rcx
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
 ; CHECK-X64-O2-NEXT:    shrb $6, %cl
-; CHECK-X64-O2-NEXT:    movzbl %cl, %edx
-; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx,8), %rsi
-; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx,8), %rdi
-; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx,8), %r9
-; CHECK-X64-O2-NEXT:    movq %r9, %r10
+; CHECK-X64-O2-NEXT:    movzbl %cl, %ecx
+; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rcx,8), %rdx
+; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rcx,8), %rsi
+; CHECK-X64-O2-NEXT:    movq -72(%rsp,%rcx,8), %rdi
+; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rcx,8), %r9
 ; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rsi, %r10
-; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx,8), %rdx
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %rsi
-; CHECK-X64-O2-NEXT:    shrdq %cl, %r9, %rdi
-; CHECK-X64-O2-NEXT:    sarq %cl, %rdx
-; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
-; CHECK-X64-O2-NEXT:    movq %rsi, 16(%rax)
-; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %rdi
+; CHECK-X64-O2-NEXT:    shrdq %cl, %r9, %rdx
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rsi, %r9
+; CHECK-X64-O2-NEXT:    sarq %cl, %rsi
+; CHECK-X64-O2-NEXT:    movq %rsi, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdx, 8(%rax)
 ; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 entry:
@@ -162,17 +161,13 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-LABEL: shift2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $80, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $112, %esp
+; CHECK-NEXT:    movl 12(%ebp), %ecx
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -181,54 +176,69 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrb $3, %al
 ; CHECK-NEXT:    andb $28, %al
 ; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    movsbl %al, %eax
-; CHECK-NEXT:    movl 52(%esp,%eax), %esi
+; CHECK-NEXT:    movl 64(%esp,%eax), %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 56(%esp,%eax), %edx
+; CHECK-NEXT:    movl 68(%esp,%eax), %edx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    shldl %cl, %esi, %edi
 ; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 60(%esp,%eax), %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    shldl %cl, %edx, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 64(%esp,%eax), %edx
-; CHECK-NEXT:    movl %edx, %ebp
-; CHECK-NEXT:    shldl %cl, %esi, %ebp
-; CHECK-NEXT:    movl 68(%esp,%eax), %esi
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl 80(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 84(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 72(%esp,%eax), %ebx
+; CHECK-NEXT:    movl %ebx, %esi
+; CHECK-NEXT:    shldl %cl, %edx, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 88(%esp,%eax), %esi
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 76(%esp,%eax), %edi
+; CHECK-NEXT:    movl 92(%esp,%eax), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shldl %cl, %ebx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    shldl %cl, %edi, %eax
+; CHECK-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, %ebx
 ; CHECK-NEXT:    shldl %cl, %edx, %ebx
-; CHECK-NEXT:    movl 72(%esp,%eax), %edi
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shldl %cl, %esi, %edx
-; CHECK-NEXT:    movl 48(%esp,%eax), %esi
-; CHECK-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; CHECK-NEXT:    movl 76(%esp,%eax), %esi
-; CHECK-NEXT:    shldl %cl, %edi, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %esi, 28(%eax)
-; CHECK-NEXT:    movl %edx, 24(%eax)
-; CHECK-NEXT:    movl %ebx, 20(%eax)
-; CHECK-NEXT:    movl %ebp, 16(%eax)
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shldl %cl, %eax, %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %esi, %eax
+; CHECK-NEXT:    movl 8(%ebp), %esi
+; CHECK-NEXT:    movl %eax, 28(%esi)
+; CHECK-NEXT:    movl %edi, 24(%esi)
+; CHECK-NEXT:    movl %ebx, 20(%esi)
+; CHECK-NEXT:    movl (%esp), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 16(%esi)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 12(%esi)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    movl %edx, 12(%eax)
+; CHECK-NEXT:    movl %edx, 8(%esi)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    movl %edx, 8(%eax)
-; CHECK-NEXT:    movl (%esp), %edi # 4-byte Reload
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    movl %edx, 4(%esi)
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %edi, %esi
-; CHECK-NEXT:    movl %esi, 4(%eax)
-; CHECK-NEXT:    movl %edx, (%eax)
-; CHECK-NEXT:    addl $80, %esp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    movl %edx, (%esi)
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -253,13 +263,13 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O0-NEXT:    andb $24, %dl
 ; CHECK-X64-O0-NEXT:    negb %dl
 ; CHECK-X64-O0-NEXT:    movsbq %dl, %r8
-; CHECK-X64-O0-NEXT:    movq -32(%rsp,%r8), %r9
-; CHECK-X64-O0-NEXT:    movq -24(%rsp,%r8), %rdx
-; CHECK-X64-O0-NEXT:    movq -16(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT:    movq -40(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT:    movq -32(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT:    movq -24(%rsp,%r8), %r10
 ; CHECK-X64-O0-NEXT:    movq %r10, %rsi
 ; CHECK-X64-O0-NEXT:    shldq %cl, %rdx, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq -8(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT:    movq -16(%rsp,%r8), %r8
 ; CHECK-X64-O0-NEXT:    shldq %cl, %r10, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
 ; CHECK-X64-O0-NEXT:    movq %r9, %r10
@@ -278,34 +288,32 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O2:       # %bb.0:
 ; CHECK-X64-O2-NEXT:    movq %rsi, %rcx
 ; CHECK-X64-O2-NEXT:    movq %rdi, %rax
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    xorps %xmm0, %xmm0
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movl %ecx, %edx
 ; CHECK-X64-O2-NEXT:    shrb $3, %dl
 ; CHECK-X64-O2-NEXT:    andb $24, %dl
 ; CHECK-X64-O2-NEXT:    negb %dl
 ; CHECK-X64-O2-NEXT:    movsbq %dl, %rdx
-; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx), %rsi
 ; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rdx), %rdx
 ; CHECK-X64-O2-NEXT:    movq %r8, %r9
-; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r9
-; CHECK-X64-O2-NEXT:    movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT:    shldq %cl, %r8, %rdx
-; CHECK-X64-O2-NEXT:    movq %rsi, %r8
-; CHECK-X64-O2-NEXT:    shlq %cl, %r8
+; CHECK-X64-O2-NEXT:    shldq %cl, %rsi, %r9
+; CHECK-X64-O2-NEXT:    movq %rdi, %r10
+; CHECK-X64-O2-NEXT:    shldq %cl, %r8, %r10
+; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %rdx
 ; CHECK-X64-O2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; CHECK-X64-O2-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT:    shlq %cl, %rsi
 ; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
-; CHECK-X64-O2-NEXT:    movq %r9, 16(%rax)
-; CHECK-X64-O2-NEXT:    movq %rdi, 8(%rax)
-; CHECK-X64-O2-NEXT:    movq %r8, (%rax)
+; CHECK-X64-O2-NEXT:    movq %r10, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %r9, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %rsi, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 {
   %b = shl i256 1, %c  ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e5affd86312ef..7d12a8166d861 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -646,76 +646,784 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: lshr_16bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $15, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: lshr_16bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $32, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $15, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: lshr_16bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $32, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
-; X86-AVX-NEXT:    andl $15, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
-; X86-AVX-NEXT:    retl
+; FALLBACK16-LABEL: lshr_16bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $60, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl 12(%ecx), %edx
+; FALLBACK16-NEXT:    movl 8(%ecx), %esi
+; FALLBACK16-NEXT:    movl (%ecx), %edi
+; FALLBACK16-NEXT:    movl 4(%ecx), %ecx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movb %ah, %al
+; FALLBACK16-NEXT:    shlb $3, %al
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $12, %ah
+; FALLBACK16-NEXT:    movzbl %ah, %ebp
+; FALLBACK16-NEXT:    movl 20(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 24(%esp,%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl 16(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movl 28(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl %ebx, 12(%edx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK16-NEXT:    movl %esi, (%edx)
+; FALLBACK16-NEXT:    movl %edi, 4(%edx)
+; FALLBACK16-NEXT:    addl $60, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_16bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $44, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT:    movl 12(%edx), %esi
+; FALLBACK17-NEXT:    movl (%edx), %edi
+; FALLBACK17-NEXT:    movl 4(%edx), %ebx
+; FALLBACK17-NEXT:    movl 8(%edx), %edx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, (%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $12, %ch
+; FALLBACK17-NEXT:    movzbl %ch, %edi
+; FALLBACK17-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK17-NEXT:    movl 4(%esp,%edi), %ebx
+; FALLBACK17-NEXT:    movl %ebx, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl (%esp,%edi), %ebp
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %ebp
+; FALLBACK17-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    shrl %cl, %edi
+; FALLBACK17-NEXT:    movl %esi, 8(%eax)
+; FALLBACK17-NEXT:    movl %edi, 12(%eax)
+; FALLBACK17-NEXT:    movl %ebp, (%eax)
+; FALLBACK17-NEXT:    movl %edx, 4(%eax)
+; FALLBACK17-NEXT:    addl $44, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_16bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $44, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl 12(%ecx), %edx
+; FALLBACK18-NEXT:    movl 8(%ecx), %esi
+; FALLBACK18-NEXT:    movl (%ecx), %edi
+; FALLBACK18-NEXT:    movl 4(%ecx), %ecx
+; FALLBACK18-NEXT:    movzbl (%eax), %ebx
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, (%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $12, %bl
+; FALLBACK18-NEXT:    movzbl %bl, %esi
+; FALLBACK18-NEXT:    movl 4(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    movl 8(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %ebx, %ecx
+; FALLBACK18-NEXT:    shrxl %eax, (%esp,%esi), %ebx
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %ebx, %edi
+; FALLBACK18-NEXT:    shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT:    movl 12(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %eax
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT:    orl %ebx, %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT:    movl %eax, 12(%esi)
+; FALLBACK18-NEXT:    movl %edx, 8(%esi)
+; FALLBACK18-NEXT:    movl %edi, (%esi)
+; FALLBACK18-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK18-NEXT:    addl $44, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_16bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $44, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT:    movl 12(%edx), %esi
+; FALLBACK19-NEXT:    movl (%edx), %edi
+; FALLBACK19-NEXT:    movl 4(%edx), %ebx
+; FALLBACK19-NEXT:    movl 8(%edx), %edx
+; FALLBACK19-NEXT:    movzbl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, (%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $12, %al
+; FALLBACK19-NEXT:    movzbl %al, %eax
+; FALLBACK19-NEXT:    movl 8(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl 4(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl %edi, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl (%esp,%eax), %ebx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl 12(%esp,%eax), %eax
+; FALLBACK19-NEXT:    shrxl %ecx, %eax, %edi
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, 8(%ebp)
+; FALLBACK19-NEXT:    movl %edi, 12(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, (%ebp)
+; FALLBACK19-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK19-NEXT:    addl $44, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_16bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $60, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $12, %cl
+; FALLBACK20-NEXT:    movzbl %cl, %edi
+; FALLBACK20-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %esi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl %edi, 12(%edx)
+; FALLBACK20-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK20-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%edx)
+; FALLBACK20-NEXT:    addl $60, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_16bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $44, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movups (%edx), %xmm0
+; FALLBACK21-NEXT:    movzbl (%ecx), %edx
+; FALLBACK21-NEXT:    movl %edx, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK21-NEXT:    andb $12, %dl
+; FALLBACK21-NEXT:    movzbl %dl, %ebx
+; FALLBACK21-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK21-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK21-NEXT:    movl %ebp, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK21-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK21-NEXT:    movl %eax, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    shrl %cl, %edx
+; FALLBACK21-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK21-NEXT:    movl %esi, (%ebp)
+; FALLBACK21-NEXT:    addl $44, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_16bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $44, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK22-NEXT:    andb $12, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    notb %cl
+; FALLBACK22-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK22-NEXT:    orl %ebx, %edx
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK22-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK22-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ebx, %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT:    movl %eax, 12(%esi)
+; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %edi, 8(%esi)
+; FALLBACK22-NEXT:    movl %edx, (%esi)
+; FALLBACK22-NEXT:    addl $44, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_16bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $44, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movups (%edx), %xmm0
+; FALLBACK23-NEXT:    movzbl (%ecx), %edx
+; FALLBACK23-NEXT:    movl %edx, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK23-NEXT:    andb $12, %dl
+; FALLBACK23-NEXT:    movzbl %dl, %ebx
+; FALLBACK23-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    movl %ebp, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %ebx
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK23-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK23-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, (%ebp)
+; FALLBACK23-NEXT:    addl $44, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_16bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $60, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $12, %cl
+; FALLBACK24-NEXT:    movzbl %cl, %edi
+; FALLBACK24-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %esi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl %edi, 12(%edx)
+; FALLBACK24-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK24-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%edx)
+; FALLBACK24-NEXT:    addl $60, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_16bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $44, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK25-NEXT:    movzbl (%ecx), %edx
+; FALLBACK25-NEXT:    movl %edx, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK25-NEXT:    andb $12, %dl
+; FALLBACK25-NEXT:    movzbl %dl, %ebx
+; FALLBACK25-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK25-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK25-NEXT:    movl %ebp, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK25-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK25-NEXT:    movl %eax, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    shrl %cl, %edx
+; FALLBACK25-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK25-NEXT:    movl %esi, (%ebp)
+; FALLBACK25-NEXT:    addl $44, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_16bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $44, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK26-NEXT:    andb $12, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    notb %cl
+; FALLBACK26-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK26-NEXT:    orl %ebx, %edx
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ebx, %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT:    movl %eax, 12(%esi)
+; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %edi, 8(%esi)
+; FALLBACK26-NEXT:    movl %edx, (%esi)
+; FALLBACK26-NEXT:    addl $44, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_16bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $44, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK27-NEXT:    movzbl (%ecx), %edx
+; FALLBACK27-NEXT:    movl %edx, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK27-NEXT:    andb $12, %dl
+; FALLBACK27-NEXT:    movzbl %dl, %ebx
+; FALLBACK27-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    movl %ebp, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %ebx
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK27-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK27-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, (%ebp)
+; FALLBACK27-NEXT:    addl $44, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_16bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $60, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $12, %cl
+; FALLBACK28-NEXT:    movzbl %cl, %edi
+; FALLBACK28-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %esi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl %edi, 12(%edx)
+; FALLBACK28-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK28-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%edx)
+; FALLBACK28-NEXT:    addl $60, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_16bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $44, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK29-NEXT:    movzbl (%ecx), %edx
+; FALLBACK29-NEXT:    movl %edx, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK29-NEXT:    andb $12, %dl
+; FALLBACK29-NEXT:    movzbl %dl, %ebx
+; FALLBACK29-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK29-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK29-NEXT:    movl %ebp, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK29-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK29-NEXT:    movl %eax, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    shrl %cl, %edx
+; FALLBACK29-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK29-NEXT:    movl %esi, (%ebp)
+; FALLBACK29-NEXT:    addl $44, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_16bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $44, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK30-NEXT:    andb $12, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    notb %cl
+; FALLBACK30-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK30-NEXT:    orl %ebx, %edx
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ebx, %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT:    movl %eax, 12(%esi)
+; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %edi, 8(%esi)
+; FALLBACK30-NEXT:    movl %edx, (%esi)
+; FALLBACK30-NEXT:    addl $44, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_16bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $44, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK31-NEXT:    movzbl (%ecx), %edx
+; FALLBACK31-NEXT:    movl %edx, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK31-NEXT:    andb $12, %dl
+; FALLBACK31-NEXT:    movzbl %dl, %ebx
+; FALLBACK31-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    movl %ebp, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %ebx
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK31-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK31-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, (%ebp)
+; FALLBACK31-NEXT:    addl $44, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -800,82 +1508,796 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: shl_16bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, (%esp)
-; X86-SSE2-NEXT:    andb $15, %cl
-; X86-SSE2-NEXT:    negb %cl
-; X86-SSE2-NEXT:    movsbl %cl, %ecx
-; X86-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 20(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 28(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 24(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: shl_16bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $32, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, (%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andb $15, %cl
-; X86-SSE42-NEXT:    negb %cl
-; X86-SSE42-NEXT:    movsbl %cl, %ecx
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: shl_16bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $32, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, (%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andb $15, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
-; X86-AVX-NEXT:    retl
+; FALLBACK16-LABEL: shl_16bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $60, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl 8(%ecx), %ebx
+; FALLBACK16-NEXT:    movl 12(%ecx), %esi
+; FALLBACK16-NEXT:    movl (%ecx), %edi
+; FALLBACK16-NEXT:    movl 4(%ecx), %ecx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movb %ah, %dh
+; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $12, %ah
+; FALLBACK16-NEXT:    negb %ah
+; FALLBACK16-NEXT:    movsbl %ah, %ebp
+; FALLBACK16-NEXT:    movl 36(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movb %dh, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 32(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 40(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 8(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 12(%eax)
+; FALLBACK16-NEXT:    movl %edi, 4(%eax)
+; FALLBACK16-NEXT:    addl $60, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_16bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $44, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT:    movl 12(%edx), %esi
+; FALLBACK17-NEXT:    movl 8(%edx), %edi
+; FALLBACK17-NEXT:    movl (%edx), %ebx
+; FALLBACK17-NEXT:    movl 4(%edx), %edx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $12, %ch
+; FALLBACK17-NEXT:    negb %ch
+; FALLBACK17-NEXT:    movsbl %ch, %edi
+; FALLBACK17-NEXT:    movl 16(%esp,%edi), %edx
+; FALLBACK17-NEXT:    movl 20(%esp,%edi), %ebx
+; FALLBACK17-NEXT:    movl %ebx, %esi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl 24(%esp,%edi), %ebp
+; FALLBACK17-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK17-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %ebp
+; FALLBACK17-NEXT:    shll %cl, %edx
+; FALLBACK17-NEXT:    movl %ebp, 8(%eax)
+; FALLBACK17-NEXT:    movl %edi, 12(%eax)
+; FALLBACK17-NEXT:    movl %edx, (%eax)
+; FALLBACK17-NEXT:    movl %esi, 4(%eax)
+; FALLBACK17-NEXT:    addl $44, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_16bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $44, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl 8(%ecx), %edx
+; FALLBACK18-NEXT:    movl 12(%ecx), %esi
+; FALLBACK18-NEXT:    movl (%ecx), %edi
+; FALLBACK18-NEXT:    movl 4(%ecx), %ecx
+; FALLBACK18-NEXT:    movzbl (%eax), %eax
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    shlb $3, %bl
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $12, %al
+; FALLBACK18-NEXT:    negb %al
+; FALLBACK18-NEXT:    movsbl %al, %edx
+; FALLBACK18-NEXT:    movl 20(%esp,%edx), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT:    movl 16(%esp,%edx), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    notb %al
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, 28(%esp,%edx), %esi
+; FALLBACK18-NEXT:    movl 24(%esp,%edx), %edx
+; FALLBACK18-NEXT:    shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT:    shrl %edx
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edx
+; FALLBACK18-NEXT:    orl %esi, %edx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %eax, %ecx, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl %ebp, (%ecx)
+; FALLBACK18-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK18-NEXT:    movl %edx, 12(%ecx)
+; FALLBACK18-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK18-NEXT:    addl $44, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_16bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $44, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT:    movl 12(%edx), %esi
+; FALLBACK19-NEXT:    movl 8(%edx), %edi
+; FALLBACK19-NEXT:    movl (%edx), %ebx
+; FALLBACK19-NEXT:    movl 4(%edx), %edx
+; FALLBACK19-NEXT:    movzbl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $12, %al
+; FALLBACK19-NEXT:    negb %al
+; FALLBACK19-NEXT:    movsbl %al, %eax
+; FALLBACK19-NEXT:    movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl 20(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %edx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK19-NEXT:    movl 24(%esp,%eax), %ebx
+; FALLBACK19-NEXT:    movl 28(%esp,%eax), %eax
+; FALLBACK19-NEXT:    shldl %cl, %ebx, %eax
+; FALLBACK19-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, 8(%ebp)
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    movl %edi, (%ebp)
+; FALLBACK19-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK19-NEXT:    addl $44, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_16bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $60, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $12, %cl
+; FALLBACK20-NEXT:    negb %cl
+; FALLBACK20-NEXT:    movsbl %cl, %edi
+; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movl %eax, (%edx)
+; FALLBACK20-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK20-NEXT:    movl %edi, 8(%edx)
+; FALLBACK20-NEXT:    movl %esi, 12(%edx)
+; FALLBACK20-NEXT:    addl $60, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_16bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $44, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movups (%edx), %xmm0
+; FALLBACK21-NEXT:    movzbl (%ecx), %edx
+; FALLBACK21-NEXT:    movl %edx, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK21-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $12, %dl
+; FALLBACK21-NEXT:    negb %dl
+; FALLBACK21-NEXT:    movsbl %dl, %edi
+; FALLBACK21-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK21-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK21-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %ebx, %ebp
+; FALLBACK21-NEXT:    shll %cl, %ebp
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT:    movl %edi, 4(%eax)
+; FALLBACK21-NEXT:    movl %esi, 8(%eax)
+; FALLBACK21-NEXT:    movl %edx, 12(%eax)
+; FALLBACK21-NEXT:    movl %ebp, (%eax)
+; FALLBACK21-NEXT:    addl $44, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_16bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $44, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK22-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $12, %cl
+; FALLBACK22-NEXT:    negb %cl
+; FALLBACK22-NEXT:    movsbl %cl, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK22-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK22-NEXT:    movl %eax, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrl %edx
+; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT:    orl %esi, %edx
+; FALLBACK22-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl %esi, %ebp
+; FALLBACK22-NEXT:    shrl %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT:    movl %eax, (%esi)
+; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK22-NEXT:    movl %edx, 12(%esi)
+; FALLBACK22-NEXT:    addl $44, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_16bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $44, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movups (%edx), %xmm0
+; FALLBACK23-NEXT:    movzbl (%ecx), %edx
+; FALLBACK23-NEXT:    movl %edx, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK23-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $12, %dl
+; FALLBACK23-NEXT:    negb %dl
+; FALLBACK23-NEXT:    movsbl %dl, %edi
+; FALLBACK23-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK23-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK23-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK23-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK23-NEXT:    movl %edi, 4(%eax)
+; FALLBACK23-NEXT:    movl %esi, 8(%eax)
+; FALLBACK23-NEXT:    movl %edx, 12(%eax)
+; FALLBACK23-NEXT:    movl %ebp, (%eax)
+; FALLBACK23-NEXT:    addl $44, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_16bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $60, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $12, %cl
+; FALLBACK24-NEXT:    negb %cl
+; FALLBACK24-NEXT:    movsbl %cl, %edi
+; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movl %eax, (%edx)
+; FALLBACK24-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK24-NEXT:    movl %edi, 8(%edx)
+; FALLBACK24-NEXT:    movl %esi, 12(%edx)
+; FALLBACK24-NEXT:    addl $60, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_16bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $44, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK25-NEXT:    movzbl (%ecx), %edx
+; FALLBACK25-NEXT:    movl %edx, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $12, %dl
+; FALLBACK25-NEXT:    negb %dl
+; FALLBACK25-NEXT:    movsbl %dl, %edi
+; FALLBACK25-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK25-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK25-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %ebx, %ebp
+; FALLBACK25-NEXT:    shll %cl, %ebp
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT:    movl %edi, 4(%eax)
+; FALLBACK25-NEXT:    movl %esi, 8(%eax)
+; FALLBACK25-NEXT:    movl %edx, 12(%eax)
+; FALLBACK25-NEXT:    movl %ebp, (%eax)
+; FALLBACK25-NEXT:    addl $44, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_16bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $44, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $12, %cl
+; FALLBACK26-NEXT:    negb %cl
+; FALLBACK26-NEXT:    movsbl %cl, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK26-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK26-NEXT:    movl %eax, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrl %edx
+; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT:    orl %esi, %edx
+; FALLBACK26-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %esi, %ebp
+; FALLBACK26-NEXT:    shrl %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT:    movl %eax, (%esi)
+; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK26-NEXT:    movl %edx, 12(%esi)
+; FALLBACK26-NEXT:    addl $44, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_16bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $44, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK27-NEXT:    movzbl (%ecx), %edx
+; FALLBACK27-NEXT:    movl %edx, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $12, %dl
+; FALLBACK27-NEXT:    negb %dl
+; FALLBACK27-NEXT:    movsbl %dl, %edi
+; FALLBACK27-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK27-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK27-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK27-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK27-NEXT:    movl %edi, 4(%eax)
+; FALLBACK27-NEXT:    movl %esi, 8(%eax)
+; FALLBACK27-NEXT:    movl %edx, 12(%eax)
+; FALLBACK27-NEXT:    movl %ebp, (%eax)
+; FALLBACK27-NEXT:    addl $44, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_16bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $60, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $12, %cl
+; FALLBACK28-NEXT:    negb %cl
+; FALLBACK28-NEXT:    movsbl %cl, %edi
+; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movl %eax, (%edx)
+; FALLBACK28-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK28-NEXT:    movl %edi, 8(%edx)
+; FALLBACK28-NEXT:    movl %esi, 12(%edx)
+; FALLBACK28-NEXT:    addl $60, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_16bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $44, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK29-NEXT:    movzbl (%ecx), %edx
+; FALLBACK29-NEXT:    movl %edx, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $12, %dl
+; FALLBACK29-NEXT:    negb %dl
+; FALLBACK29-NEXT:    movsbl %dl, %edi
+; FALLBACK29-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK29-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK29-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %ebx, %ebp
+; FALLBACK29-NEXT:    shll %cl, %ebp
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT:    movl %edi, 4(%eax)
+; FALLBACK29-NEXT:    movl %esi, 8(%eax)
+; FALLBACK29-NEXT:    movl %edx, 12(%eax)
+; FALLBACK29-NEXT:    movl %ebp, (%eax)
+; FALLBACK29-NEXT:    addl $44, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_16bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $44, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $12, %cl
+; FALLBACK30-NEXT:    negb %cl
+; FALLBACK30-NEXT:    movsbl %cl, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK30-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK30-NEXT:    movl %eax, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrl %edx
+; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT:    orl %esi, %edx
+; FALLBACK30-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl %esi, %ebp
+; FALLBACK30-NEXT:    shrl %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT:    movl %eax, (%esi)
+; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK30-NEXT:    movl %edx, 12(%esi)
+; FALLBACK30-NEXT:    addl $44, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_16bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $44, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK31-NEXT:    movzbl (%ecx), %edx
+; FALLBACK31-NEXT:    movl %edx, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $12, %dl
+; FALLBACK31-NEXT:    negb %dl
+; FALLBACK31-NEXT:    movsbl %dl, %edi
+; FALLBACK31-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK31-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK31-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK31-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK31-NEXT:    movl %edi, 4(%eax)
+; FALLBACK31-NEXT:    movl %esi, 8(%eax)
+; FALLBACK31-NEXT:    movl %edx, 12(%eax)
+; FALLBACK31-NEXT:    movl %ebp, (%eax)
+; FALLBACK31-NEXT:    addl $44, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -960,107 +2382,226 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: ashr_16bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    sarl $31, %edx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $15, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: ashr_16bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $32, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movl (%edx), %esi
-; X86-SSE42-NEXT:    movl 4(%edx), %edi
-; X86-SSE42-NEXT:    movl 8(%edx), %ebx
-; X86-SSE42-NEXT:    movl 12(%edx), %edx
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $15, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: ashr_16bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $32, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl (%edx), %esi
-; X86-AVX-NEXT:    movl 4(%edx), %edi
-; X86-AVX-NEXT:    movl 8(%edx), %ebx
-; X86-AVX-NEXT:    movl 12(%edx), %edx
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $15, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
-; X86-AVX-NEXT:    retl
+; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2:       # %bb.0:
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    notb %dl
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    retl
+;
+; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%edx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esp,%edi), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
+;
+; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esp,%edi), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -1070,172 +2611,1944 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_32bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
-; X64-SSE2-NEXT:    movzbl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $31, %esi
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: lshr_32bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movzbl (%rsi), %eax
-; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $31, %eax
-; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX-LABEL: lshr_32bytes:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    movzbl (%rsi), %eax
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $31, %eax
-; X64-AVX-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    vzeroupper
-; X64-AVX-NEXT:    retq
-;
-; X86-SSE2-LABEL: lshr_32bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-SSE2-NEXT:    movl 16(%eax), %ebx
-; X86-SSE2-NEXT:    movl 20(%eax), %ebp
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $31, %eax
-; X86-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: lshr_32bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $64, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $31, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: lshr_32bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $64, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    andl $31, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    vzeroupper
-; X86-AVX-NEXT:    retl
+; FALLBACK0-LABEL: lshr_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK0-NEXT:    movq (%rdi), %r8
+; FALLBACK0-NEXT:    movq 8(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r8
+; FALLBACK0-NEXT:    movq %rdi, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r11
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r10, %r9
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r11, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r8
+; FALLBACK0-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: lshr_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq 24(%rdi), %rax
+; FALLBACK1-NEXT:    movq (%rdi), %r8
+; FALLBACK1-NEXT:    movq 8(%rdi), %r9
+; FALLBACK1-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shrq %cl, %rax
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: lshr_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK2-NEXT:    movq (%rdi), %r8
+; FALLBACK2-NEXT:    movq 8(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %r9
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %r9, %r10
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %r9, %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %r9
+; FALLBACK2-NEXT:    orq %r8, %r9
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %rcx, %rsi
+; FALLBACK2-NEXT:    leaq (%rdi,%rdi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: lshr_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq 24(%rdi), %rax
+; FALLBACK3-NEXT:    movq (%rdi), %r8
+; FALLBACK3-NEXT:    movq 8(%rdi), %r9
+; FALLBACK3-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rdi
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: lshr_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %cl
+; FALLBACK4-NEXT:    movzbl %cl, %r9d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r11, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: lshr_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %al
+; FALLBACK5-NEXT:    movzbl %al, %eax
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq %rdi, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shrq %cl, %rsi
+; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: lshr_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %cl
+; FALLBACK6-NEXT:    movzbl %cl, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r9, %rcx
+; FALLBACK6-NEXT:    addq %r8, %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: lshr_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %al
+; FALLBACK7-NEXT:    movzbl %al, %eax
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq %rdi, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: lshr_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %cl
+; FALLBACK8-NEXT:    movzbl %cl, %r9d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r11, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: lshr_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %al
+; FALLBACK9-NEXT:    movzbl %al, %eax
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shrq %cl, %rsi
+; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: lshr_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %cl
+; FALLBACK10-NEXT:    movzbl %cl, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r9, %rcx
+; FALLBACK10-NEXT:    addq %r8, %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: lshr_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %al
+; FALLBACK11-NEXT:    movzbl %al, %eax
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: lshr_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %cl
+; FALLBACK12-NEXT:    movzbl %cl, %r9d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r11, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: lshr_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %al
+; FALLBACK13-NEXT:    movzbl %al, %eax
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    movq %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shrq %cl, %rsi
+; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: lshr_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %cl
+; FALLBACK14-NEXT:    movzbl %cl, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r9, %rcx
+; FALLBACK14-NEXT:    addq %r8, %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: lshr_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %al
+; FALLBACK15-NEXT:    movzbl %al, %eax
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: lshr_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl 12(%eax), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %esi
+; FALLBACK16-NEXT:    movl 8(%eax), %edi
+; FALLBACK16-NEXT:    movl 24(%eax), %ebx
+; FALLBACK16-NEXT:    movb (%ecx), %ch
+; FALLBACK16-NEXT:    movl (%eax), %ebp
+; FALLBACK16-NEXT:    movl 4(%eax), %edx
+; FALLBACK16-NEXT:    movl 20(%eax), %eax
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movb %ch, %dl
+; FALLBACK16-NEXT:    shlb $3, %dl
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %ch
+; FALLBACK16-NEXT:    movzbl %ch, %ebx
+; FALLBACK16-NEXT:    movl 36(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movb %dl, %ch
+; FALLBACK16-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movl 40(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %eax, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movl 48(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 52(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl 56(%esp,%eax), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%esp,%ebx), %ebx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 16(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 20(%ecx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $92, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl 12(%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ebp), %esi
+; FALLBACK17-NEXT:    movl (%ebp), %edi
+; FALLBACK17-NEXT:    movl 4(%ebp), %ebx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movl 20(%ebp), %edx
+; FALLBACK17-NEXT:    movl 8(%ebp), %eax
+; FALLBACK17-NEXT:    movl 24(%ebp), %ebp
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %ch
+; FALLBACK17-NEXT:    movzbl %ch, %eax
+; FALLBACK17-NEXT:    movl 24(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 20(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl 36(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%esp,%eax), %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 28(%esp,%eax), %ebp
+; FALLBACK17-NEXT:    movl 44(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %ebp, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, %ebp
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %ebp
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %edi, 24(%eax)
+; FALLBACK17-NEXT:    shrl %cl, %edx
+; FALLBACK17-NEXT:    movl %edx, 28(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 16(%eax)
+; FALLBACK17-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK17-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK17-NEXT:    movl %esi, 12(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, (%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK17-NEXT:    addl $92, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %esi
+; FALLBACK18-NEXT:    movl 8(%eax), %edi
+; FALLBACK18-NEXT:    movl 24(%eax), %edx
+; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK18-NEXT:    movl (%eax), %ebp
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl 20(%eax), %eax
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %bl
+; FALLBACK18-NEXT:    movzbl %bl, %esi
+; FALLBACK18-NEXT:    movl 36(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    movl 40(%esp,%esi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %ecx, 32(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%esp,%esi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %edi
+; FALLBACK18-NEXT:    movl 44(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    movl %ecx, %eax
+; FALLBACK18-NEXT:    shrxl %ecx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %ebx
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    orl %eax, %esi
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK18-NEXT:    movl %esi, 24(%eax)
+; FALLBACK18-NEXT:    movl %edi, 16(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $92, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl 12(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%ecx), %esi
+; FALLBACK19-NEXT:    movl (%ecx), %edi
+; FALLBACK19-NEXT:    movl 4(%ecx), %ebp
+; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK19-NEXT:    movl 20(%ecx), %edx
+; FALLBACK19-NEXT:    movl 8(%ecx), %eax
+; FALLBACK19-NEXT:    movl 24(%ecx), %ecx
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %bl
+; FALLBACK19-NEXT:    movzbl %bl, %eax
+; FALLBACK19-NEXT:    movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl 40(%esp,%eax), %ebp
+; FALLBACK19-NEXT:    movl 20(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 36(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl %edx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%esp,%eax), %ebx
+; FALLBACK19-NEXT:    movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 44(%esp,%eax), %eax
+; FALLBACK19-NEXT:    movl %edx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT:    shrxl %ecx, %eax, %edx
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl %ebp, 24(%ecx)
+; FALLBACK19-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK19-NEXT:    movl %ebx, 16(%ecx)
+; FALLBACK19-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK19-NEXT:    movl %edi, 12(%ecx)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, (%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK19-NEXT:    addl $92, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %cl
+; FALLBACK20-NEXT:    movzbl %cl, %ecx
+; FALLBACK20-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK20-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %ecx, %edi
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK20-NEXT:    movl %esi, 4(%eax)
+; FALLBACK20-NEXT:    movl %edi, 24(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $108, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movzbl (%eax), %eax
+; FALLBACK21-NEXT:    movl %eax, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %al
+; FALLBACK21-NEXT:    movzbl %al, %ebp
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl %edi, %esi
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    shrl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %edx, (%ebp)
+; FALLBACK21-NEXT:    addl $108, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $108, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %edx
+; FALLBACK22-NEXT:    shlb $3, %dl
+; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %edx, %eax
+; FALLBACK22-NEXT:    notb %al
+; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT:    movl %eax, %ebp
+; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK22-NEXT:    movl %ebp, %ecx
+; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %ebx, %ebx
+; FALLBACK22-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT:    orl %ebp, %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    movl %ecx, %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ebp, %edi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK22-NEXT:    movl %edi, 24(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, (%edx)
+; FALLBACK22-NEXT:    addl $108, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $108, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movzbl (%eax), %eax
+; FALLBACK23-NEXT:    movl %eax, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %al
+; FALLBACK23-NEXT:    movzbl %al, %ebx
+; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK23-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK23-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK23-NEXT:    movl %esi, 16(%eax)
+; FALLBACK23-NEXT:    movl %edi, 20(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 8(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 12(%eax)
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, (%eax)
+; FALLBACK23-NEXT:    addl $108, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %cl
+; FALLBACK24-NEXT:    movzbl %cl, %ecx
+; FALLBACK24-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK24-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %ecx, %edi
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK24-NEXT:    movl %esi, 4(%eax)
+; FALLBACK24-NEXT:    movl %edi, 24(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $108, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    movzbl (%eax), %eax
+; FALLBACK25-NEXT:    movl %eax, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %al
+; FALLBACK25-NEXT:    movzbl %al, %ebp
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl %edi, %esi
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    shrl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %edx, (%ebp)
+; FALLBACK25-NEXT:    addl $108, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $108, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %edx
+; FALLBACK26-NEXT:    shlb $3, %dl
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %edx, %eax
+; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT:    movl %eax, %ebp
+; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK26-NEXT:    movl %ebp, %ecx
+; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %ebx, %ebx
+; FALLBACK26-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT:    orl %ebp, %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    movl %ecx, %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK26-NEXT:    movl %edi, 24(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, (%edx)
+; FALLBACK26-NEXT:    addl $108, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $108, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    movzbl (%eax), %eax
+; FALLBACK27-NEXT:    movl %eax, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %al
+; FALLBACK27-NEXT:    movzbl %al, %ebx
+; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK27-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK27-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK27-NEXT:    movl %esi, 16(%eax)
+; FALLBACK27-NEXT:    movl %edi, 20(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 8(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 12(%eax)
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, (%eax)
+; FALLBACK27-NEXT:    addl $108, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %cl
+; FALLBACK28-NEXT:    movzbl %cl, %ecx
+; FALLBACK28-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK28-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %ecx, %edi
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK28-NEXT:    movl %esi, 4(%eax)
+; FALLBACK28-NEXT:    movl %edi, 24(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $108, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT:    movzbl (%eax), %eax
+; FALLBACK29-NEXT:    movl %eax, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %al
+; FALLBACK29-NEXT:    movzbl %al, %ebp
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl %edi, %esi
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    shrl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %edx, (%ebp)
+; FALLBACK29-NEXT:    addl $108, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $108, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %edx
+; FALLBACK30-NEXT:    shlb $3, %dl
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %edx, %eax
+; FALLBACK30-NEXT:    notb %al
+; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT:    movl %eax, %ebp
+; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK30-NEXT:    movl %ebp, %ecx
+; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %ebx, %ebx
+; FALLBACK30-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT:    orl %ebp, %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    movl %ecx, %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK30-NEXT:    movl %edi, 24(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, (%edx)
+; FALLBACK30-NEXT:    addl $108, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $108, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT:    movzbl (%eax), %eax
+; FALLBACK31-NEXT:    movl %eax, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %al
+; FALLBACK31-NEXT:    movzbl %al, %ebx
+; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK31-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK31-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK31-NEXT:    movl %esi, 16(%eax)
+; FALLBACK31-NEXT:    movl %edi, 20(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 8(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 12(%eax)
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, (%eax)
+; FALLBACK31-NEXT:    addl $108, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1244,184 +4557,1973 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_32bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
-; X64-SSE2-NEXT:    movzbl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andb $31, %sil
-; X64-SSE2-NEXT:    negb %sil
-; X64-SSE2-NEXT:    movsbq %sil, %rax
-; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rcx
-; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %rdi
-; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rcx, (%rdx)
-; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: shl_32bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movzbl (%rsi), %eax
-; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andb $31, %al
-; X64-SSE42-NEXT:    negb %al
-; X64-SSE42-NEXT:    movsbq %al, %rax
-; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX-LABEL: shl_32bytes:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    movzbl (%rsi), %eax
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andb $31, %al
-; X64-AVX-NEXT:    negb %al
-; X64-AVX-NEXT:    movsbq %al, %rax
-; X64-AVX-NEXT:    vmovups -32(%rsp,%rax), %xmm0
-; X64-AVX-NEXT:    vmovups -16(%rsp,%rax), %xmm1
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    vzeroupper
-; X64-AVX-NEXT:    retq
-;
-; X86-SSE2-LABEL: shl_32bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl (%edi), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%edi), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%edi), %esi
-; X86-SSE2-NEXT:    movl 12(%edi), %ebx
-; X86-SSE2-NEXT:    movl 16(%edi), %ebp
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-SSE2-NEXT:    movl 20(%edi), %edx
-; X86-SSE2-NEXT:    movl 24(%edi), %ecx
-; X86-SSE2-NEXT:    movl 28(%edi), %edi
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andb $31, %al
-; X86-SSE2-NEXT:    negb %al
-; X86-SSE2-NEXT:    movsbl %al, %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%edx), %esi
-; X86-SSE2-NEXT:    movl 48(%esp,%edx), %edi
-; X86-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
-; X86-SSE2-NEXT:    movl 56(%esp,%edx), %ebp
-; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ecx
-; X86-SSE2-NEXT:    movl 64(%esp,%edx), %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %edx, 24(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: shl_32bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $64, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, (%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andb $31, %cl
-; X86-SSE42-NEXT:    negb %cl
-; X86-SSE42-NEXT:    movsbl %cl, %ecx
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: shl_32bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $64, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
-; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andb $31, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    vzeroupper
-; X86-AVX-NEXT:    retl
+; FALLBACK0-LABEL: shl_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK0-NEXT:    movq (%rdi), %r8
+; FALLBACK0-NEXT:    movq 8(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    negb %sil
+; FALLBACK0-NEXT:    movsbq %sil, %r8
+; FALLBACK0-NEXT:    movq -24(%rsp,%r8), %rdi
+; FALLBACK0-NEXT:    movq -8(%rsp,%r8), %r10
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK0-NEXT:    movq -16(%rsp,%r8), %rbx
+; FALLBACK0-NEXT:    movq %r9, %r8
+; FALLBACK0-NEXT:    shrq %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r8
+; FALLBACK0-NEXT:    orq %r11, %r8
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    movq %rbx, %r11
+; FALLBACK0-NEXT:    shrq %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    orq %r10, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    shrq %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    orq %rbx, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    movq %r9, (%rdx)
+; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: shl_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq 24(%rdi), %rax
+; FALLBACK1-NEXT:    movq (%rdi), %r8
+; FALLBACK1-NEXT:    movq 8(%rdi), %r9
+; FALLBACK1-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    negb %sil
+; FALLBACK1-NEXT:    movsbq %sil, %rax
+; FALLBACK1-NEXT:    movq -40(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -24(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -32(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq -16(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shldq %cl, %rsi, %r9
+; FALLBACK1-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK1-NEXT:    shldq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shlq %cl, %rsi
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: shl_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK2-NEXT:    movq (%rdi), %r8
+; FALLBACK2-NEXT:    movq 8(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    negb %sil
+; FALLBACK2-NEXT:    movsbq %sil, %rsi
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rdi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %r10
+; FALLBACK2-NEXT:    shlxq %rax, %r8, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK2-NEXT:    orq %rdi, %r8
+; FALLBACK2-NEXT:    shrq %r9
+; FALLBACK2-NEXT:    shrxq %rax, %r9, %rdi
+; FALLBACK2-NEXT:    orq %rsi, %rdi
+; FALLBACK2-NEXT:    shrq %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, (%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: shl_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq 24(%rdi), %rax
+; FALLBACK3-NEXT:    movq (%rdi), %r8
+; FALLBACK3-NEXT:    movq 8(%rdi), %r9
+; FALLBACK3-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    negb %sil
+; FALLBACK3-NEXT:    movsbq %sil, %rax
+; FALLBACK3-NEXT:    movq -40(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -24(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -32(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq -16(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shldq %cl, %rsi, %r9
+; FALLBACK3-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK3-NEXT:    shldq %cl, %r8, %rdi
+; FALLBACK3-NEXT:    shlxq %rcx, %rsi, %rcx
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rcx, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: shl_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %cl
+; FALLBACK4-NEXT:    negb %cl
+; FALLBACK4-NEXT:    movsbq %cl, %r8
+; FALLBACK4-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq %r10, %rdi
+; FALLBACK4-NEXT:    shrq %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r9, %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT:    movq %r8, %r11
+; FALLBACK4-NEXT:    shrq %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    orq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    movq %r9, %r10
+; FALLBACK4-NEXT:    shrq %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, (%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: shl_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %al
+; FALLBACK5-NEXT:    negb %al
+; FALLBACK5-NEXT:    movsbq %al, %rax
+; FALLBACK5-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    movq %r8, %r9
+; FALLBACK5-NEXT:    shlq %cl, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK5-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: shl_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %cl
+; FALLBACK6-NEXT:    negb %cl
+; FALLBACK6-NEXT:    movsbq %cl, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    shrq %rdi
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    shrq %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r8, %rcx
+; FALLBACK6-NEXT:    shrq %r9
+; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, (%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: shl_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %al
+; FALLBACK7-NEXT:    negb %al
+; FALLBACK7-NEXT:    movsbq %al, %rax
+; FALLBACK7-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK7-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: shl_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %cl
+; FALLBACK8-NEXT:    negb %cl
+; FALLBACK8-NEXT:    movsbq %cl, %r8
+; FALLBACK8-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT:    movq %r10, %rdi
+; FALLBACK8-NEXT:    shrq %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r9, %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT:    movq %r8, %r11
+; FALLBACK8-NEXT:    shrq %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    orq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    movq %r9, %r10
+; FALLBACK8-NEXT:    shrq %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, (%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: shl_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %al
+; FALLBACK9-NEXT:    negb %al
+; FALLBACK9-NEXT:    movsbq %al, %rax
+; FALLBACK9-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    movq %r8, %r9
+; FALLBACK9-NEXT:    shlq %cl, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK9-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: shl_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %cl
+; FALLBACK10-NEXT:    negb %cl
+; FALLBACK10-NEXT:    movsbq %cl, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    shrq %rdi
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    shrq %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r8, %rcx
+; FALLBACK10-NEXT:    shrq %r9
+; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, (%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: shl_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %al
+; FALLBACK11-NEXT:    negb %al
+; FALLBACK11-NEXT:    movsbq %al, %rax
+; FALLBACK11-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK11-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: shl_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %cl
+; FALLBACK12-NEXT:    negb %cl
+; FALLBACK12-NEXT:    movsbq %cl, %r8
+; FALLBACK12-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT:    movq %r10, %rdi
+; FALLBACK12-NEXT:    shrq %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r9, %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT:    movq %r8, %r11
+; FALLBACK12-NEXT:    shrq %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    orq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    movq %r9, %r10
+; FALLBACK12-NEXT:    shrq %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, (%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: shl_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %al
+; FALLBACK13-NEXT:    negb %al
+; FALLBACK13-NEXT:    movsbq %al, %rax
+; FALLBACK13-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    movq %r8, %r9
+; FALLBACK13-NEXT:    shlq %cl, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK13-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: shl_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %cl
+; FALLBACK14-NEXT:    negb %cl
+; FALLBACK14-NEXT:    movsbq %cl, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    shrq %rdi
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    shrq %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r8, %rcx
+; FALLBACK14-NEXT:    shrq %r9
+; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, (%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: shl_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %al
+; FALLBACK15-NEXT:    negb %al
+; FALLBACK15-NEXT:    movsbq %al, %rax
+; FALLBACK15-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK15-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: shl_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl 8(%ecx), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%ecx), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ecx), %esi
+; FALLBACK16-NEXT:    movl 28(%ecx), %edi
+; FALLBACK16-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movl (%ecx), %ebp
+; FALLBACK16-NEXT:    movl 4(%ecx), %edx
+; FALLBACK16-NEXT:    movl 20(%ecx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movb %ah, %ch
+; FALLBACK16-NEXT:    shlb $3, %ch
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %ah
+; FALLBACK16-NEXT:    negb %ah
+; FALLBACK16-NEXT:    movsbl %ah, %edi
+; FALLBACK16-NEXT:    movl 68(%esp,%edi), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movb %ch, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 64(%esp,%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %esi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 72(%esp,%edi), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%edi), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 80(%esp,%edi), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %ebx
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%edi), %eax
+; FALLBACK16-NEXT:    movl 88(%esp,%edi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl %ebp, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %ebp, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 24(%eax)
+; FALLBACK16-NEXT:    movl %edi, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl %ebx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $108, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl 12(%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%eax), %esi
+; FALLBACK17-NEXT:    movl 24(%eax), %edi
+; FALLBACK17-NEXT:    movl 20(%eax), %ebx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movl (%eax), %edx
+; FALLBACK17-NEXT:    movl 4(%eax), %ebp
+; FALLBACK17-NEXT:    movl 16(%eax), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %ch
+; FALLBACK17-NEXT:    negb %ch
+; FALLBACK17-NEXT:    movsbl %ch, %ebx
+; FALLBACK17-NEXT:    movl 64(%esp,%ebx), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%ebx), %ebp
+; FALLBACK17-NEXT:    movl 68(%esp,%ebx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 84(%esp,%ebx), %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%ebx), %edi
+; FALLBACK17-NEXT:    movl 88(%esp,%ebx), %edx
+; FALLBACK17-NEXT:    movl 76(%esp,%ebx), %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 92(%esp,%ebx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %ebp
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %edx, 24(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK17-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK17-NEXT:    movl %esi, 20(%eax)
+; FALLBACK17-NEXT:    movl %edi, 8(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, 12(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shll %cl, %edx
+; FALLBACK17-NEXT:    movl %edx, (%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK17-NEXT:    addl $108, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %esi
+; FALLBACK18-NEXT:    movl 28(%eax), %edi
+; FALLBACK18-NEXT:    movl 16(%eax), %edx
+; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK18-NEXT:    movl (%eax), %ebp
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl 20(%eax), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %bl
+; FALLBACK18-NEXT:    negb %bl
+; FALLBACK18-NEXT:    movsbl %bl, %esi
+; FALLBACK18-NEXT:    movl 68(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    movl 64(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ebx
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 72(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %eax, %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %ecx, %eax
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK18-NEXT:    movl 84(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %eax, 92(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl 88(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shlxl %eax, %esi, %eax
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ebx, %esi
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %ebp, (%eax)
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl %esi, 28(%eax)
+; FALLBACK18-NEXT:    movl %edi, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $108, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl 12(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%eax), %esi
+; FALLBACK19-NEXT:    movl 24(%eax), %edi
+; FALLBACK19-NEXT:    movl 20(%eax), %edx
+; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK19-NEXT:    movl (%eax), %ecx
+; FALLBACK19-NEXT:    movl 4(%eax), %ebp
+; FALLBACK19-NEXT:    movl 16(%eax), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %bl
+; FALLBACK19-NEXT:    negb %bl
+; FALLBACK19-NEXT:    movsbl %bl, %ebx
+; FALLBACK19-NEXT:    movl 64(%esp,%ebx), %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 80(%esp,%ebx), %esi
+; FALLBACK19-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 84(%esp,%ebx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl %edx, %eax
+; FALLBACK19-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%ebx), %edi
+; FALLBACK19-NEXT:    movl 88(%esp,%ebx), %edx
+; FALLBACK19-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 92(%esp,%ebx), %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK19-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %ebp, %edx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl %edx, 24(%ecx)
+; FALLBACK19-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK19-NEXT:    movl %esi, 16(%ecx)
+; FALLBACK19-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK19-NEXT:    movl %edi, 8(%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, (%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK19-NEXT:    addl $108, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movb %cl, %dh
+; FALLBACK20-NEXT:    shlb $3, %dh
+; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %cl
+; FALLBACK20-NEXT:    negb %cl
+; FALLBACK20-NEXT:    movsbl %cl, %ebp
+; FALLBACK20-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movb %dh, %dl
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %esi, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl %ebx, %eax
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 68(%esp,%eax), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shrl %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl 88(%esp,%eax), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %edx, (%eax)
+; FALLBACK20-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK20-NEXT:    movl %esi, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $92, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movzbl (%eax), %eax
+; FALLBACK21-NEXT:    movl %eax, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %al
+; FALLBACK21-NEXT:    negb %al
+; FALLBACK21-NEXT:    movsbl %al, %ebp
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %esi
+; FALLBACK21-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl %esi, %edx
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl %edx, 24(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %edx, %eax
+; FALLBACK21-NEXT:    shll %cl, %eax
+; FALLBACK21-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 12(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 16(%ebp)
+; FALLBACK21-NEXT:    movl %eax, (%ebp)
+; FALLBACK21-NEXT:    addl $92, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $92, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    negb %cl
+; FALLBACK22-NEXT:    movsbl %cl, %esi
+; FALLBACK22-NEXT:    movl 64(%esp,%esi), %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK22-NEXT:    movl %eax, %edx
+; FALLBACK22-NEXT:    notb %dl
+; FALLBACK22-NEXT:    movl 60(%esp,%esi), %ebp
+; FALLBACK22-NEXT:    movl %ebp, %ebx
+; FALLBACK22-NEXT:    shrl %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT:    orl %edi, %ebx
+; FALLBACK22-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 68(%esp,%esi), %ebx
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %edi
+; FALLBACK22-NEXT:    shlxl %eax, %ebx, %ecx
+; FALLBACK22-NEXT:    orl %ecx, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %ebp, %ecx
+; FALLBACK22-NEXT:    movl 56(%esp,%esi), %edi
+; FALLBACK22-NEXT:    movl %edi, %ebp
+; FALLBACK22-NEXT:    shrl %ebp
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %ecx, %ebp
+; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %ecx
+; FALLBACK22-NEXT:    movl 52(%esp,%esi), %edi
+; FALLBACK22-NEXT:    movl %edi, %ebp
+; FALLBACK22-NEXT:    shrl %ebp
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %ecx, %ebp
+; FALLBACK22-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %ebp
+; FALLBACK22-NEXT:    movl 48(%esp,%esi), %edi
+; FALLBACK22-NEXT:    movl %edi, %ecx
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK22-NEXT:    movl 72(%esp,%esi), %esi
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %eax
+; FALLBACK22-NEXT:    shrl %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %edi, (%edx)
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK22-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 16(%edx)
+; FALLBACK22-NEXT:    addl $92, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $92, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movzbl (%eax), %eax
+; FALLBACK23-NEXT:    movl %eax, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %al
+; FALLBACK23-NEXT:    negb %al
+; FALLBACK23-NEXT:    movsbl %al, %ebx
+; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, %edi
+; FALLBACK23-NEXT:    shldl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK23-NEXT:    movl 72(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl %esi, %eax
+; FALLBACK23-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK23-NEXT:    movl %ebx, 28(%esi)
+; FALLBACK23-NEXT:    movl %eax, 24(%esi)
+; FALLBACK23-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT:    movl %ebp, 4(%esi)
+; FALLBACK23-NEXT:    movl %edi, 8(%esi)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 20(%esi)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 16(%esi)
+; FALLBACK23-NEXT:    movl %eax, (%esi)
+; FALLBACK23-NEXT:    addl $92, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movb %cl, %dh
+; FALLBACK24-NEXT:    shlb $3, %dh
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %cl
+; FALLBACK24-NEXT:    negb %cl
+; FALLBACK24-NEXT:    movsbl %cl, %ebp
+; FALLBACK24-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movb %dh, %dl
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %esi, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl %ebx, %eax
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 68(%esp,%eax), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    shrl %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl 88(%esp,%eax), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %edx, (%eax)
+; FALLBACK24-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK24-NEXT:    movl %esi, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $92, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    movzbl (%eax), %eax
+; FALLBACK25-NEXT:    movl %eax, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %al
+; FALLBACK25-NEXT:    negb %al
+; FALLBACK25-NEXT:    movsbl %al, %ebp
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %esi
+; FALLBACK25-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl %esi, %edx
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl %edx, 24(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %edx, %eax
+; FALLBACK25-NEXT:    shll %cl, %eax
+; FALLBACK25-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 12(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 16(%ebp)
+; FALLBACK25-NEXT:    movl %eax, (%ebp)
+; FALLBACK25-NEXT:    addl $92, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $92, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    negb %cl
+; FALLBACK26-NEXT:    movsbl %cl, %esi
+; FALLBACK26-NEXT:    movl 64(%esp,%esi), %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK26-NEXT:    movl %eax, %edx
+; FALLBACK26-NEXT:    notb %dl
+; FALLBACK26-NEXT:    movl 60(%esp,%esi), %ebp
+; FALLBACK26-NEXT:    movl %ebp, %ebx
+; FALLBACK26-NEXT:    shrl %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT:    orl %edi, %ebx
+; FALLBACK26-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 68(%esp,%esi), %ebx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %edi
+; FALLBACK26-NEXT:    shlxl %eax, %ebx, %ecx
+; FALLBACK26-NEXT:    orl %ecx, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %ebp, %ecx
+; FALLBACK26-NEXT:    movl 56(%esp,%esi), %edi
+; FALLBACK26-NEXT:    movl %edi, %ebp
+; FALLBACK26-NEXT:    shrl %ebp
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %ecx, %ebp
+; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %ecx
+; FALLBACK26-NEXT:    movl 52(%esp,%esi), %edi
+; FALLBACK26-NEXT:    movl %edi, %ebp
+; FALLBACK26-NEXT:    shrl %ebp
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %ecx, %ebp
+; FALLBACK26-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %ebp
+; FALLBACK26-NEXT:    movl 48(%esp,%esi), %edi
+; FALLBACK26-NEXT:    movl %edi, %ecx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK26-NEXT:    movl 72(%esp,%esi), %esi
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %eax
+; FALLBACK26-NEXT:    shrl %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %edi, (%edx)
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK26-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%edx)
+; FALLBACK26-NEXT:    addl $92, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $92, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    movzbl (%eax), %eax
+; FALLBACK27-NEXT:    movl %eax, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %al
+; FALLBACK27-NEXT:    negb %al
+; FALLBACK27-NEXT:    movsbl %al, %ebx
+; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, %edi
+; FALLBACK27-NEXT:    shldl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK27-NEXT:    movl 72(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl %esi, %eax
+; FALLBACK27-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK27-NEXT:    movl %ebx, 28(%esi)
+; FALLBACK27-NEXT:    movl %eax, 24(%esi)
+; FALLBACK27-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK27-NEXT:    movl %ebp, 4(%esi)
+; FALLBACK27-NEXT:    movl %edi, 8(%esi)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 20(%esi)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 16(%esi)
+; FALLBACK27-NEXT:    movl %eax, (%esi)
+; FALLBACK27-NEXT:    addl $92, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movb %cl, %dh
+; FALLBACK28-NEXT:    shlb $3, %dh
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %cl
+; FALLBACK28-NEXT:    negb %cl
+; FALLBACK28-NEXT:    movsbl %cl, %ebp
+; FALLBACK28-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movb %dh, %dl
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %esi, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl %ebx, %eax
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 68(%esp,%eax), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    shrl %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl 88(%esp,%eax), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %edx, (%eax)
+; FALLBACK28-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK28-NEXT:    movl %esi, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $92, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT:    movzbl (%eax), %eax
+; FALLBACK29-NEXT:    movl %eax, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %al
+; FALLBACK29-NEXT:    negb %al
+; FALLBACK29-NEXT:    movsbl %al, %ebp
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %esi
+; FALLBACK29-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl %esi, %edx
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl %edx, 24(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %edx, %eax
+; FALLBACK29-NEXT:    shll %cl, %eax
+; FALLBACK29-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 12(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 16(%ebp)
+; FALLBACK29-NEXT:    movl %eax, (%ebp)
+; FALLBACK29-NEXT:    addl $92, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $92, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    negb %cl
+; FALLBACK30-NEXT:    movsbl %cl, %esi
+; FALLBACK30-NEXT:    movl 64(%esp,%esi), %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK30-NEXT:    movl %eax, %edx
+; FALLBACK30-NEXT:    notb %dl
+; FALLBACK30-NEXT:    movl 60(%esp,%esi), %ebp
+; FALLBACK30-NEXT:    movl %ebp, %ebx
+; FALLBACK30-NEXT:    shrl %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT:    orl %edi, %ebx
+; FALLBACK30-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 68(%esp,%esi), %ebx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %edx, %ecx, %edi
+; FALLBACK30-NEXT:    shlxl %eax, %ebx, %ecx
+; FALLBACK30-NEXT:    orl %ecx, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %ebp, %ecx
+; FALLBACK30-NEXT:    movl 56(%esp,%esi), %edi
+; FALLBACK30-NEXT:    movl %edi, %ebp
+; FALLBACK30-NEXT:    shrl %ebp
+; FALLBACK30-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %ecx, %ebp
+; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %ecx
+; FALLBACK30-NEXT:    movl 52(%esp,%esi), %edi
+; FALLBACK30-NEXT:    movl %edi, %ebp
+; FALLBACK30-NEXT:    shrl %ebp
+; FALLBACK30-NEXT:    shrxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %ecx, %ebp
+; FALLBACK30-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %ebp
+; FALLBACK30-NEXT:    movl 48(%esp,%esi), %edi
+; FALLBACK30-NEXT:    movl %edi, %ecx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %eax, 76(%esp,%esi), %ebp
+; FALLBACK30-NEXT:    movl 72(%esp,%esi), %esi
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %eax
+; FALLBACK30-NEXT:    shrl %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT:    orl %eax, %ebx
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %edi, (%edx)
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK30-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%edx)
+; FALLBACK30-NEXT:    addl $92, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $92, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT:    movzbl (%eax), %eax
+; FALLBACK31-NEXT:    movl %eax, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %al
+; FALLBACK31-NEXT:    negb %al
+; FALLBACK31-NEXT:    movsbl %al, %ebx
+; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, %edi
+; FALLBACK31-NEXT:    shldl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK31-NEXT:    movl 72(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl %esi, %eax
+; FALLBACK31-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK31-NEXT:    movl %ebx, 28(%esi)
+; FALLBACK31-NEXT:    movl %eax, 24(%esi)
+; FALLBACK31-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK31-NEXT:    movl %ebp, 4(%esi)
+; FALLBACK31-NEXT:    movl %edi, 8(%esi)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 20(%esi)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 16(%esi)
+; FALLBACK31-NEXT:    movl %eax, (%esi)
+; FALLBACK31-NEXT:    addl $92, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1430,222 +6532,2199 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_32bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
-; X64-SSE2-NEXT:    movzbl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    sarq $63, %rdi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $31, %esi
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: ashr_32bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movq 16(%rdi), %rax
-; X64-SSE42-NEXT:    movq 24(%rdi), %rcx
-; X64-SSE42-NEXT:    movzbl (%rsi), %esi
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    sarq $63, %rcx
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $31, %esi
-; X64-SSE42-NEXT:    movups -64(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX-LABEL: ashr_32bytes:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
-; X64-AVX-NEXT:    movq 16(%rdi), %rax
-; X64-AVX-NEXT:    movq 24(%rdi), %rcx
-; X64-AVX-NEXT:    movzbl (%rsi), %esi
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    sarq $63, %rcx
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $31, %esi
-; X64-AVX-NEXT:    vmovups -64(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT:    vmovups -48(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    retq
-;
-; X86-SSE2-LABEL: ashr_32bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %edi
-; X86-SSE2-NEXT:    movl 12(%eax), %ebx
-; X86-SSE2-NEXT:    movl 16(%eax), %ebp
-; X86-SSE2-NEXT:    movl 20(%eax), %esi
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $31, %eax
-; X86-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: ashr_32bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $64, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movl 16(%edx), %esi
-; X86-SSE42-NEXT:    movl 20(%edx), %edi
-; X86-SSE42-NEXT:    movl 24(%edx), %ebx
-; X86-SSE42-NEXT:    movl 28(%edx), %edx
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $31, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: ashr_32bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $64, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movl 16(%edx), %esi
-; X86-AVX-NEXT:    movl 20(%edx), %edi
-; X86-AVX-NEXT:    movl 24(%edx), %ebx
-; X86-AVX-NEXT:    movl 28(%edx), %edx
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $31, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
-; X86-AVX-NEXT:    retl
+; FALLBACK0-LABEL: ashr_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK0-NEXT:    movq (%rdi), %r8
+; FALLBACK0-NEXT:    movq 8(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    sarq $63, %rdi
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r8
+; FALLBACK0-NEXT:    movq %rdi, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r11
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r10, %r9
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r11, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    sarq %cl, %r8
+; FALLBACK0-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: ashr_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq 24(%rdi), %rax
+; FALLBACK1-NEXT:    movq (%rdi), %r8
+; FALLBACK1-NEXT:    movq 8(%rdi), %r9
+; FALLBACK1-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    sarq $63, %rax
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    sarq %cl, %rax
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: ashr_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK2-NEXT:    movq (%rdi), %r8
+; FALLBACK2-NEXT:    movq 8(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    sarq $63, %rdi
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %r9
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %r9, %r10
+; FALLBACK2-NEXT:    sarxq %rax, %rdi, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %r9, %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %r9
+; FALLBACK2-NEXT:    orq %r8, %r9
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %rcx, %rsi
+; FALLBACK2-NEXT:    leaq (%rdi,%rdi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: ashr_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq 24(%rdi), %rax
+; FALLBACK3-NEXT:    movq (%rdi), %r8
+; FALLBACK3-NEXT:    movq 8(%rdi), %r9
+; FALLBACK3-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    sarq $63, %rax
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rdi
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: ashr_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK4-NEXT:    movzbl (%rsi), %esi
+; FALLBACK4-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    sarq $63, %rcx
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %sil
+; FALLBACK4-NEXT:    movzbl %sil, %r8d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movq -48(%rsp,%r8), %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -56(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r11
+; FALLBACK4-NEXT:    leaq (%r10,%r10), %r8
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    orq %r9, %r8
+; FALLBACK4-NEXT:    movq %rdi, %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    leaq (%r11,%r11), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r9, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    addq %rdi, %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    sarq %cl, %r11
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r8, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: ashr_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movq 16(%rdi), %rax
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK5-NEXT:    movzbl (%rsi), %esi
+; FALLBACK5-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    sarq $63, %rdi
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %sil
+; FALLBACK5-NEXT:    movzbl %sil, %eax
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq %rax, %r9
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    sarq %cl, %rdi
+; FALLBACK5-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r8, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: ashr_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK6-NEXT:    movzbl (%rsi), %esi
+; FALLBACK6-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    sarq $63, %rcx
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %sil
+; FALLBACK6-NEXT:    movzbl %sil, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %r8
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r9
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    sarxq %rax, %r8, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    leaq (%r8,%r8), %rsi
+; FALLBACK6-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %r9, %rsi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: ashr_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movq 16(%rdi), %rax
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK7-NEXT:    movzbl (%rsi), %esi
+; FALLBACK7-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    sarq $63, %rdi
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %sil
+; FALLBACK7-NEXT:    movzbl %sil, %eax
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq %rax, %r9
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    sarxq %rcx, %rdi, %rax
+; FALLBACK7-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r8, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: ashr_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK8-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK8-NEXT:    movzbl (%rsi), %esi
+; FALLBACK8-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    sarq $63, %rcx
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %sil
+; FALLBACK8-NEXT:    movzbl %sil, %r8d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movq -48(%rsp,%r8), %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -56(%rsp,%r8), %r10
+; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r11
+; FALLBACK8-NEXT:    leaq (%r10,%r10), %r8
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    orq %r9, %r8
+; FALLBACK8-NEXT:    movq %rdi, %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    leaq (%r11,%r11), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r9, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    addq %rdi, %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    sarq %cl, %r11
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r8, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: ashr_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    movq 16(%rdi), %rax
+; FALLBACK9-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK9-NEXT:    movzbl (%rsi), %esi
+; FALLBACK9-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    sarq $63, %rdi
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %sil
+; FALLBACK9-NEXT:    movzbl %sil, %eax
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq %rax, %r9
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    sarq %cl, %rdi
+; FALLBACK9-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r8, (%rdx)
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: ashr_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK10-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK10-NEXT:    movzbl (%rsi), %esi
+; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    sarq $63, %rcx
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %sil
+; FALLBACK10-NEXT:    movzbl %sil, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %r8
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r9
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    sarxq %rax, %r8, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    leaq (%r8,%r8), %rsi
+; FALLBACK10-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %r9, %rsi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: ashr_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    movq 16(%rdi), %rax
+; FALLBACK11-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK11-NEXT:    movzbl (%rsi), %esi
+; FALLBACK11-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    sarq $63, %rdi
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %sil
+; FALLBACK11-NEXT:    movzbl %sil, %eax
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq %rax, %r9
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    sarxq %rcx, %rdi, %rax
+; FALLBACK11-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r8, (%rdx)
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: ashr_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK12-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK12-NEXT:    movzbl (%rsi), %esi
+; FALLBACK12-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    sarq $63, %rcx
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %sil
+; FALLBACK12-NEXT:    movzbl %sil, %r8d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movq -48(%rsp,%r8), %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -56(%rsp,%r8), %r10
+; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r11
+; FALLBACK12-NEXT:    leaq (%r10,%r10), %r8
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    orq %r9, %r8
+; FALLBACK12-NEXT:    movq %rdi, %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    leaq (%r11,%r11), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r9, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    addq %rdi, %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    sarq %cl, %r11
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r8, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: ashr_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    movq 16(%rdi), %rax
+; FALLBACK13-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK13-NEXT:    movzbl (%rsi), %esi
+; FALLBACK13-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    sarq $63, %rdi
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %sil
+; FALLBACK13-NEXT:    movzbl %sil, %eax
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq %rax, %r9
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    sarq %cl, %rdi
+; FALLBACK13-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r8, (%rdx)
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: ashr_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK14-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT:    movq 16(%rdi), %rdi
+; FALLBACK14-NEXT:    movzbl (%rsi), %esi
+; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    sarq $63, %rcx
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %sil
+; FALLBACK14-NEXT:    movzbl %sil, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %r8
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r9
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    sarxq %rax, %r8, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    leaq (%r8,%r8), %rsi
+; FALLBACK14-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r9, %rsi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: ashr_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    movq 16(%rdi), %rax
+; FALLBACK15-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK15-NEXT:    movzbl (%rsi), %esi
+; FALLBACK15-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    sarq $63, %rdi
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %sil
+; FALLBACK15-NEXT:    movzbl %sil, %eax
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq %rax, %r9
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r9
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    sarxq %rcx, %rdi, %rax
+; FALLBACK15-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r8, (%rdx)
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: ashr_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %edi
+; FALLBACK16-NEXT:    movl 24(%eax), %ebx
+; FALLBACK16-NEXT:    movl 28(%eax), %esi
+; FALLBACK16-NEXT:    movzbl (%edx), %edx
+; FALLBACK16-NEXT:    movl (%eax), %ebp
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl 20(%eax), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movb %dl, %dh
+; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    sarl $31, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %dl
+; FALLBACK16-NEXT:    movzbl %dl, %ebx
+; FALLBACK16-NEXT:    movl 36(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movb %dh, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 40(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 48(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 52(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl 56(%esp,%eax), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    sarl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 16(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 20(%ecx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: ashr_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $92, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl 12(%ecx), %edx
+; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ecx), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl (%ecx), %edi
+; FALLBACK17-NEXT:    movl 4(%ecx), %ebx
+; FALLBACK17-NEXT:    movl 20(%ecx), %ebp
+; FALLBACK17-NEXT:    movzbl (%eax), %edx
+; FALLBACK17-NEXT:    movl 28(%ecx), %esi
+; FALLBACK17-NEXT:    movl 8(%ecx), %eax
+; FALLBACK17-NEXT:    movl 24(%ecx), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, %ecx
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    sarl $31, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %dl
+; FALLBACK17-NEXT:    movzbl %dl, %eax
+; FALLBACK17-NEXT:    movl 24(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 20(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl 36(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%esp,%eax), %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 28(%esp,%eax), %ebp
+; FALLBACK17-NEXT:    movl 44(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %ebp, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, %ebp
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %ebp
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %edi, 24(%eax)
+; FALLBACK17-NEXT:    sarl %cl, %edx
+; FALLBACK17-NEXT:    movl %edx, 28(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 16(%eax)
+; FALLBACK17-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK17-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK17-NEXT:    movl %esi, 12(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, (%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK17-NEXT:    addl $92, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: ashr_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %edi
+; FALLBACK18-NEXT:    movl 24(%eax), %esi
+; FALLBACK18-NEXT:    movl 28(%eax), %ebp
+; FALLBACK18-NEXT:    movzbl (%edx), %edx
+; FALLBACK18-NEXT:    movl (%eax), %ebx
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl 20(%eax), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    sarl $31, %ebp
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %dl
+; FALLBACK18-NEXT:    movzbl %dl, %esi
+; FALLBACK18-NEXT:    movl 36(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebx
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    movl 40(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %ebp
+; FALLBACK18-NEXT:    orl %ebx, %ebp
+; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %ebx, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%esp,%esi), %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %edi
+; FALLBACK18-NEXT:    movl 44(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT:    sarxl %ebx, %esi, %ebx
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT:    orl %eax, %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK18-NEXT:    movl %edx, 24(%eax)
+; FALLBACK18-NEXT:    movl %edi, 16(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: ashr_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $92, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl 12(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl (%eax), %edi
+; FALLBACK19-NEXT:    movl 4(%eax), %ebx
+; FALLBACK19-NEXT:    movl 20(%eax), %esi
+; FALLBACK19-NEXT:    movzbl (%edx), %edx
+; FALLBACK19-NEXT:    movl 28(%eax), %ebp
+; FALLBACK19-NEXT:    movl 8(%eax), %ecx
+; FALLBACK19-NEXT:    movl 24(%eax), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    sarl $31, %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %dl
+; FALLBACK19-NEXT:    movzbl %dl, %eax
+; FALLBACK19-NEXT:    movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl 40(%esp,%eax), %ebp
+; FALLBACK19-NEXT:    movl 20(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 36(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl %edx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%esp,%eax), %ebx
+; FALLBACK19-NEXT:    movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 44(%esp,%eax), %eax
+; FALLBACK19-NEXT:    movl %edx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT:    sarxl %ecx, %eax, %edx
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl %ebp, 24(%ecx)
+; FALLBACK19-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK19-NEXT:    movl %ebx, 16(%ecx)
+; FALLBACK19-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK19-NEXT:    movl %edi, 12(%ecx)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, (%ecx)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK19-NEXT:    addl $92, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: ashr_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK20-NEXT:    movl 24(%edi), %ecx
+; FALLBACK20-NEXT:    movl 20(%edi), %eax
+; FALLBACK20-NEXT:    movl 28(%edi), %esi
+; FALLBACK20-NEXT:    movups (%edi), %xmm0
+; FALLBACK20-NEXT:    movl 16(%edi), %edi
+; FALLBACK20-NEXT:    movzbl (%edx), %edx
+; FALLBACK20-NEXT:    movb %dl, %dh
+; FALLBACK20-NEXT:    shlb $3, %dh
+; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    sarl $31, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %dl
+; FALLBACK20-NEXT:    movzbl %dl, %ebx
+; FALLBACK20-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK20-NEXT:    movl 48(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movb %dh, %dl
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl 36(%esp,%ebx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%esp,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    leal (%ebp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 40(%esp,%ebx), %esi
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 56(%esp,%ebx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl 60(%esp,%ebx), %ebx
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %edi, %esi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    sarl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK20-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK20-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%ecx)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: ashr_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $108, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movl 24(%edx), %eax
+; FALLBACK21-NEXT:    movl 16(%edx), %esi
+; FALLBACK21-NEXT:    movl 28(%edx), %edi
+; FALLBACK21-NEXT:    movups (%edx), %xmm0
+; FALLBACK21-NEXT:    movl 20(%edx), %ebx
+; FALLBACK21-NEXT:    movzbl (%ecx), %edx
+; FALLBACK21-NEXT:    movl %edx, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    sarl $31, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %dl
+; FALLBACK21-NEXT:    movzbl %dl, %eax
+; FALLBACK21-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 32(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl 48(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 44(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 52(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 60(%esp,%eax), %ebx
+; FALLBACK21-NEXT:    movl 40(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%eax), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, %ebp
+; FALLBACK21-NEXT:    movl %edi, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK21-NEXT:    movl %esi, 4(%edi)
+; FALLBACK21-NEXT:    movl %eax, 24(%edi)
+; FALLBACK21-NEXT:    sarl %cl, %ebx
+; FALLBACK21-NEXT:    movl %ebx, 28(%edi)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%edi)
+; FALLBACK21-NEXT:    movl %edx, 20(%edi)
+; FALLBACK21-NEXT:    movl %ebp, 8(%edi)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%edi)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, (%edi)
+; FALLBACK21-NEXT:    addl $108, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: ashr_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $108, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movl 24(%ecx), %edx
+; FALLBACK22-NEXT:    movl 20(%ecx), %esi
+; FALLBACK22-NEXT:    movl 28(%ecx), %edi
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    sarl $31, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %eax, %edx
+; FALLBACK22-NEXT:    movl %eax, %ebx
+; FALLBACK22-NEXT:    notb %dl
+; FALLBACK22-NEXT:    movl 36(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ebx, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %esi, %ecx, %ebx
+; FALLBACK22-NEXT:    movl %esi, %ecx
+; FALLBACK22-NEXT:    orl %ebx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK22-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %ebp
+; FALLBACK22-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK22-NEXT:    shrxl %esi, %eax, %esi
+; FALLBACK22-NEXT:    orl %esi, %ebp
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT:    sarxl %ecx, %edi, %ebx
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    movl %edi, 24(%eax)
+; FALLBACK22-NEXT:    movl %esi, 16(%eax)
+; FALLBACK22-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, (%eax)
+; FALLBACK22-NEXT:    addl $108, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: ashr_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $108, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movl 24(%edx), %eax
+; FALLBACK23-NEXT:    movl 16(%edx), %esi
+; FALLBACK23-NEXT:    movl 28(%edx), %edi
+; FALLBACK23-NEXT:    movups (%edx), %xmm0
+; FALLBACK23-NEXT:    movl 20(%edx), %ebx
+; FALLBACK23-NEXT:    movzbl (%ecx), %edx
+; FALLBACK23-NEXT:    movl %edx, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    sarl $31, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %dl
+; FALLBACK23-NEXT:    movzbl %dl, %eax
+; FALLBACK23-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 32(%esp,%eax), %edi
+; FALLBACK23-NEXT:    movl 48(%esp,%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 44(%esp,%eax), %edi
+; FALLBACK23-NEXT:    movl %edi, %ebx
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 52(%esp,%eax), %esi
+; FALLBACK23-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%esp,%eax), %ebx
+; FALLBACK23-NEXT:    movl 56(%esp,%eax), %ebp
+; FALLBACK23-NEXT:    movl %ebx, %eax
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl %esi, %eax
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT:    sarxl %ecx, %eax, %edi
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK23-NEXT:    movl %ebp, 24(%ecx)
+; FALLBACK23-NEXT:    movl %edi, 28(%ecx)
+; FALLBACK23-NEXT:    movl %edx, 16(%ecx)
+; FALLBACK23-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, (%ecx)
+; FALLBACK23-NEXT:    addl $108, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: ashr_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK24-NEXT:    movl 24(%edi), %ecx
+; FALLBACK24-NEXT:    movl 20(%edi), %eax
+; FALLBACK24-NEXT:    movl 28(%edi), %esi
+; FALLBACK24-NEXT:    vmovups (%edi), %xmm0
+; FALLBACK24-NEXT:    movl 16(%edi), %edi
+; FALLBACK24-NEXT:    movzbl (%edx), %edx
+; FALLBACK24-NEXT:    movb %dl, %dh
+; FALLBACK24-NEXT:    shlb $3, %dh
+; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    sarl $31, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %dl
+; FALLBACK24-NEXT:    movzbl %dl, %ebx
+; FALLBACK24-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK24-NEXT:    movl 48(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movb %dh, %dl
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl 36(%esp,%ebx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%esp,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    leal (%ebp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 40(%esp,%ebx), %esi
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 56(%esp,%ebx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl 60(%esp,%ebx), %ebx
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %edi, %esi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    sarl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK24-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK24-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%ecx)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: ashr_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $108, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    movl 24(%edx), %eax
+; FALLBACK25-NEXT:    movl 16(%edx), %esi
+; FALLBACK25-NEXT:    movl 28(%edx), %edi
+; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK25-NEXT:    movl 20(%edx), %ebx
+; FALLBACK25-NEXT:    movzbl (%ecx), %edx
+; FALLBACK25-NEXT:    movl %edx, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    sarl $31, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %dl
+; FALLBACK25-NEXT:    movzbl %dl, %eax
+; FALLBACK25-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 32(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl 48(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 44(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 52(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 60(%esp,%eax), %ebx
+; FALLBACK25-NEXT:    movl 40(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%eax), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, %ebp
+; FALLBACK25-NEXT:    movl %edi, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK25-NEXT:    movl %esi, 4(%edi)
+; FALLBACK25-NEXT:    movl %eax, 24(%edi)
+; FALLBACK25-NEXT:    sarl %cl, %ebx
+; FALLBACK25-NEXT:    movl %ebx, 28(%edi)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%edi)
+; FALLBACK25-NEXT:    movl %edx, 20(%edi)
+; FALLBACK25-NEXT:    movl %ebp, 8(%edi)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%edi)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, (%edi)
+; FALLBACK25-NEXT:    addl $108, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: ashr_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $108, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    movl 24(%ecx), %edx
+; FALLBACK26-NEXT:    movl 20(%ecx), %esi
+; FALLBACK26-NEXT:    movl 28(%ecx), %edi
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    sarl $31, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %eax, %edx
+; FALLBACK26-NEXT:    movl %eax, %ebx
+; FALLBACK26-NEXT:    notb %dl
+; FALLBACK26-NEXT:    movl 36(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    orl %ecx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ebx, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %esi, %ecx, %ebx
+; FALLBACK26-NEXT:    movl %esi, %ecx
+; FALLBACK26-NEXT:    orl %ebx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK26-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK26-NEXT:    shlxl %edx, %ebp, %ebp
+; FALLBACK26-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK26-NEXT:    shrxl %esi, %eax, %esi
+; FALLBACK26-NEXT:    orl %esi, %ebp
+; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT:    sarxl %ecx, %edi, %ebx
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    movl %edi, 24(%eax)
+; FALLBACK26-NEXT:    movl %esi, 16(%eax)
+; FALLBACK26-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, (%eax)
+; FALLBACK26-NEXT:    addl $108, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: ashr_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $108, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    movl 24(%edx), %eax
+; FALLBACK27-NEXT:    movl 16(%edx), %esi
+; FALLBACK27-NEXT:    movl 28(%edx), %edi
+; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK27-NEXT:    movl 20(%edx), %ebx
+; FALLBACK27-NEXT:    movzbl (%ecx), %edx
+; FALLBACK27-NEXT:    movl %edx, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    sarl $31, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %dl
+; FALLBACK27-NEXT:    movzbl %dl, %eax
+; FALLBACK27-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 32(%esp,%eax), %edi
+; FALLBACK27-NEXT:    movl 48(%esp,%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 44(%esp,%eax), %edi
+; FALLBACK27-NEXT:    movl %edi, %ebx
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 52(%esp,%eax), %esi
+; FALLBACK27-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%esp,%eax), %ebx
+; FALLBACK27-NEXT:    movl 56(%esp,%eax), %ebp
+; FALLBACK27-NEXT:    movl %ebx, %eax
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl %esi, %eax
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT:    sarxl %ecx, %eax, %edi
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK27-NEXT:    movl %ebp, 24(%ecx)
+; FALLBACK27-NEXT:    movl %edi, 28(%ecx)
+; FALLBACK27-NEXT:    movl %edx, 16(%ecx)
+; FALLBACK27-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, (%ecx)
+; FALLBACK27-NEXT:    addl $108, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: ashr_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK28-NEXT:    movl 24(%edi), %ecx
+; FALLBACK28-NEXT:    movl 20(%edi), %eax
+; FALLBACK28-NEXT:    movl 28(%edi), %esi
+; FALLBACK28-NEXT:    vmovups (%edi), %xmm0
+; FALLBACK28-NEXT:    movl 16(%edi), %edi
+; FALLBACK28-NEXT:    movzbl (%edx), %edx
+; FALLBACK28-NEXT:    movb %dl, %dh
+; FALLBACK28-NEXT:    shlb $3, %dh
+; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    sarl $31, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %dl
+; FALLBACK28-NEXT:    movzbl %dl, %ebx
+; FALLBACK28-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK28-NEXT:    movl 48(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movb %dh, %dl
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl 36(%esp,%ebx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%esp,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    leal (%ebp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 40(%esp,%ebx), %esi
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 56(%esp,%ebx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl 60(%esp,%ebx), %ebx
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %edi, %esi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    sarl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %ebx, 28(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK28-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK28-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%ecx)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: ashr_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $108, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    movl 24(%edx), %eax
+; FALLBACK29-NEXT:    movl 16(%edx), %esi
+; FALLBACK29-NEXT:    movl 28(%edx), %edi
+; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK29-NEXT:    movl 20(%edx), %ebx
+; FALLBACK29-NEXT:    movzbl (%ecx), %edx
+; FALLBACK29-NEXT:    movl %edx, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    sarl $31, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %dl
+; FALLBACK29-NEXT:    movzbl %dl, %eax
+; FALLBACK29-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 32(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl 48(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 44(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 52(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 60(%esp,%eax), %ebx
+; FALLBACK29-NEXT:    movl 40(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%eax), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, %ebp
+; FALLBACK29-NEXT:    movl %edi, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK29-NEXT:    movl %esi, 4(%edi)
+; FALLBACK29-NEXT:    movl %eax, 24(%edi)
+; FALLBACK29-NEXT:    sarl %cl, %ebx
+; FALLBACK29-NEXT:    movl %ebx, 28(%edi)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%edi)
+; FALLBACK29-NEXT:    movl %edx, 20(%edi)
+; FALLBACK29-NEXT:    movl %ebp, 8(%edi)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%edi)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, (%edi)
+; FALLBACK29-NEXT:    addl $108, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: ashr_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $108, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    movl 24(%ecx), %edx
+; FALLBACK30-NEXT:    movl 20(%ecx), %esi
+; FALLBACK30-NEXT:    movl 28(%ecx), %edi
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    sarl $31, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %eax, %edx
+; FALLBACK30-NEXT:    movl %eax, %ebx
+; FALLBACK30-NEXT:    notb %dl
+; FALLBACK30-NEXT:    movl 36(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    orl %ecx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ebx, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %esi, %ecx, %ebx
+; FALLBACK30-NEXT:    movl %esi, %ecx
+; FALLBACK30-NEXT:    orl %ebx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK30-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK30-NEXT:    shrxl %esi, %eax, %esi
+; FALLBACK30-NEXT:    orl %esi, %ebp
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT:    sarxl %ecx, %edi, %ebx
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    movl %edi, 24(%eax)
+; FALLBACK30-NEXT:    movl %esi, 16(%eax)
+; FALLBACK30-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, (%eax)
+; FALLBACK30-NEXT:    addl $108, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: ashr_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $108, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    movl 24(%edx), %eax
+; FALLBACK31-NEXT:    movl 16(%edx), %esi
+; FALLBACK31-NEXT:    movl 28(%edx), %edi
+; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK31-NEXT:    movl 20(%edx), %ebx
+; FALLBACK31-NEXT:    movzbl (%ecx), %edx
+; FALLBACK31-NEXT:    movl %edx, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    sarl $31, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %dl
+; FALLBACK31-NEXT:    movzbl %dl, %eax
+; FALLBACK31-NEXT:    movl 36(%esp,%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 32(%esp,%eax), %edi
+; FALLBACK31-NEXT:    movl 48(%esp,%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 44(%esp,%eax), %edi
+; FALLBACK31-NEXT:    movl %edi, %ebx
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 52(%esp,%eax), %esi
+; FALLBACK31-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%esp,%eax), %ebx
+; FALLBACK31-NEXT:    movl 56(%esp,%eax), %ebp
+; FALLBACK31-NEXT:    movl %ebx, %eax
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl %esi, %eax
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT:    sarxl %ecx, %eax, %edi
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %ebx, %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK31-NEXT:    movl %ebp, 24(%ecx)
+; FALLBACK31-NEXT:    movl %edi, 28(%ecx)
+; FALLBACK31-NEXT:    movl %edx, 16(%ecx)
+; FALLBACK31-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, (%ecx)
+; FALLBACK31-NEXT:    addl $108, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1655,343 +8734,3643 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_64bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rbx
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %r9
-; X64-SSE2-NEXT:    movq 32(%rdi), %r10
-; X64-SSE2-NEXT:    movq 40(%rdi), %r11
-; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
-; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
-; X64-SSE2-NEXT:    movl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
-; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
-; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
-; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
-; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT:    popq %rbx
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: lshr_64bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
-; X64-SSE42-NEXT:    movl (%rsi), %eax
-; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %eax
-; X64-SSE42-NEXT:    movups -128(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rax), %xmm3
-; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX1-LABEL: lshr_64bytes:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    movl (%rsi), %eax
-; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    andl $63, %eax
-; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax), %xmm3
-; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX512-LABEL: lshr_64bytes:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT:    movl (%rsi), %eax
-; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    andl $63, %eax
-; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X86-SSE2-LABEL: lshr_64bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
-; X86-SSE2-NEXT:    movl %edx, 60(%eax)
-; X86-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: lshr_64bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $128, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX1-LABEL: lshr_64bytes:
-; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    subl $128, %esp
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT:    movl (%ecx), %ecx
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT:    andl $63, %ecx
-; X86-AVX1-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $128, %esp
-; X86-AVX1-NEXT:    vzeroupper
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX512-LABEL: lshr_64bytes:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $128, %esp
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
-; X86-AVX512-NEXT:    movl (%ecx), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT:    vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT:    andl $63, %ecx
-; X86-AVX512-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX512-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX512-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX512-NEXT:    vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $128, %esp
-; X86-AVX512-NEXT:    vzeroupper
-; X86-AVX512-NEXT:    retl
+; FALLBACK0-LABEL: lshr_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rax
+; FALLBACK0-NEXT:    movq 32(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 48(%rdi), %r8
+; FALLBACK0-NEXT:    movq (%rdi), %r9
+; FALLBACK0-NEXT:    movq 8(%rdi), %r10
+; FALLBACK0-NEXT:    movq 24(%rdi), %r11
+; FALLBACK0-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %r14
+; FALLBACK0-NEXT:    movl (%rsi), %edi
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %edi
+; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK0-NEXT:    movq %r8, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    leaq (%r14,%r14), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %r11, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r8, %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %rbx, %r8
+; FALLBACK0-NEXT:    movq %r9, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    orq %r15, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    addq %r9, %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r14, %r9
+; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    movq %r14, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r12
+; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r14, %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    orq %rbx, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r13, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 48(%rdx)
+; FALLBACK0-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: lshr_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    pushq %rax
+; FALLBACK1-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK1-NEXT:    movq 40(%rdi), %r8
+; FALLBACK1-NEXT:    movq 56(%rdi), %r9
+; FALLBACK1-NEXT:    movq (%rdi), %r10
+; FALLBACK1-NEXT:    movq 8(%rdi), %r11
+; FALLBACK1-NEXT:    movq 16(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 32(%rdi), %r14
+; FALLBACK1-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %eax
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %eax
+; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq %r9, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq %r10, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %rdi
+; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK1-NEXT:    movq %rbx, %r14
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %r14
+; FALLBACK1-NEXT:    shrdq %cl, %rbx, %r9
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shrq %cl, %rax
+; FALLBACK1-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK1-NEXT:    addq $8, %rsp
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: lshr_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK2-NEXT:    movq 32(%rdi), %r8
+; FALLBACK2-NEXT:    movq 48(%rdi), %r9
+; FALLBACK2-NEXT:    movq (%rdi), %r10
+; FALLBACK2-NEXT:    movq 8(%rdi), %r11
+; FALLBACK2-NEXT:    movq 24(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 40(%rdi), %r14
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %rbx
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbp
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r13
+; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    notb %r12b
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    addq %r8, %r8
+; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT:    orq %rbp, %r8
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %rbp
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    addq %r14, %r14
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT:    orq %r13, %r11
+; FALLBACK2-NEXT:    addq %rax, %rax
+; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT:    orq %rbp, %rax
+; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: lshr_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    pushq %rax
+; FALLBACK3-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK3-NEXT:    movq 40(%rdi), %r8
+; FALLBACK3-NEXT:    movq 56(%rdi), %r9
+; FALLBACK3-NEXT:    movq (%rdi), %r10
+; FALLBACK3-NEXT:    movq 8(%rdi), %r11
+; FALLBACK3-NEXT:    movq 16(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 32(%rdi), %r14
+; FALLBACK3-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %eax
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %eax
+; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq %r9, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq %r10, %r11
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %r11
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %rdi
+; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK3-NEXT:    movq %rbx, %r14
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %r14
+; FALLBACK3-NEXT:    shrdq %cl, %rbx, %r9
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK3-NEXT:    addq $8, %rsp
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: lshr_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbp
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    pushq %rax
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT:    movl (%rsi), %r8d
+; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%r8,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %r8d
+; FALLBACK4-NEXT:    movq -128(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq -120(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -104(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq %r10, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    movq -96(%rsp,%r8), %r12
+; FALLBACK4-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r11
+; FALLBACK4-NEXT:    orq %rbx, %r11
+; FALLBACK4-NEXT:    movq -112(%rsp,%r8), %rbx
+; FALLBACK4-NEXT:    movq %rbx, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r14
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r14, %r10
+; FALLBACK4-NEXT:    movq -88(%rsp,%r8), %r14
+; FALLBACK4-NEXT:    movq %r14, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r13
+; FALLBACK4-NEXT:    movq -80(%rsp,%r8), %rbp
+; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    orq %r13, %r15
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    addq %r14, %r14
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    orq %r12, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbp
+; FALLBACK4-NEXT:    movq -72(%rsp,%r8), %r8
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    orq %rbp, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    addq %rbx, %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r9, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    addq $8, %rsp
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    popq %rbp
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: lshr_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq %r9, %rsi
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT:    movq %r10, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    movq %r11, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shrq %cl, %r11
+; FALLBACK5-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK5-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r14, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: lshr_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    pushq %rax
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rax,8), %esi
+; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT:    movl %esi, %ebx
+; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT:    orq %r11, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r9, %rdi
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    orq %r14, %r9
+; FALLBACK6-NEXT:    addq %r10, %r10
+; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    orq %r15, %r10
+; FALLBACK6-NEXT:    addq %rax, %rax
+; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    orq %r13, %rax
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %rbp, %rcx
+; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r8, (%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: lshr_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq %r9, %rsi
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT:    movq %r10, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    movq %r11, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r15
+; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r14, (%rdx)
+; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: lshr_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbp
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    pushq %rax
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT:    movl (%rsi), %r9d
+; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%r9,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %r9d
+; FALLBACK8-NEXT:    movq -128(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq -120(%rsp,%r9), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -104(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    movq -96(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r11
+; FALLBACK8-NEXT:    orq %rbx, %r11
+; FALLBACK8-NEXT:    movq -112(%rsp,%r9), %rbx
+; FALLBACK8-NEXT:    movq %rbx, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r14
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r14, %r10
+; FALLBACK8-NEXT:    movq -88(%rsp,%r9), %r14
+; FALLBACK8-NEXT:    movq %r14, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r13
+; FALLBACK8-NEXT:    movq -80(%rsp,%r9), %rbp
+; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    orq %r13, %r15
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    addq %r14, %r14
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    orq %r12, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbp
+; FALLBACK8-NEXT:    movq -72(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    orq %rbp, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %rbx, %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r8, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    addq $8, %rsp
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    popq %rbp
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: lshr_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq %r9, %rsi
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    movq %r10, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    movq %r11, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shrq %cl, %r11
+; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r14, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: lshr_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    pushq %rax
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rax,8), %esi
+; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT:    movl %esi, %ebx
+; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT:    orq %r11, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r9, %rdi
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    orq %r14, %r9
+; FALLBACK10-NEXT:    addq %r10, %r10
+; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    orq %r15, %r10
+; FALLBACK10-NEXT:    addq %rax, %rax
+; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    orq %r13, %rax
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %rbp, %rcx
+; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r8, (%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: lshr_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq %r9, %rsi
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT:    movq %r10, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    movq %r11, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r15
+; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r14, (%rdx)
+; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: lshr_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbp
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    pushq %rax
+; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT:    movl (%rsi), %r9d
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%r9,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %r9d
+; FALLBACK12-NEXT:    movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r11
+; FALLBACK12-NEXT:    orq %rbx, %r11
+; FALLBACK12-NEXT:    movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT:    movq %rbx, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r14
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r14, %r10
+; FALLBACK12-NEXT:    movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT:    movq %r14, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r13
+; FALLBACK12-NEXT:    movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    orq %r13, %r15
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    addq %r14, %r14
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    orq %r12, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbp
+; FALLBACK12-NEXT:    movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    orq %rbp, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %rbx, %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r8, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    addq $8, %rsp
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    popq %rbp
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: lshr_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT:    movl (%rsi), %edi
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rdi,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %edi
+; FALLBACK13-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK13-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK13-NEXT:    movq %r9, %rax
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK13-NEXT:    movq -112(%rsp,%rdi), %r10
+; FALLBACK13-NEXT:    movq %r10, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK13-NEXT:    movq -88(%rsp,%rdi), %r11
+; FALLBACK13-NEXT:    movq %r11, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK13-NEXT:    movq -72(%rsp,%rdi), %r11
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK13-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r14
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shrq %cl, %r11
+; FALLBACK13-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK13-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r14, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: lshr_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    pushq %rax
+; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT:    movl (%rsi), %esi
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %rax
+; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT:    movq -96(%rsp,%rsi), %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rax, %r9
+; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %r10
+; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT:    movl %ecx, %ebx
+; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %rbp
+; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT:    orq %r11, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    movq -80(%rsp,%rsi), %r12
+; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT:    movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r9, %rdi
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    orq %r14, %r9
+; FALLBACK14-NEXT:    addq %r10, %r10
+; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    orq %r15, %r10
+; FALLBACK14-NEXT:    addq %rsi, %rsi
+; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r13, %rsi
+; FALLBACK14-NEXT:    addq %rax, %rax
+; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    orq %rbp, %rax
+; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 48(%rdx)
+; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r8, (%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: lshr_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq %r9, %rsi
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    movq %r10, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    movq %r11, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r15
+; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r14, (%rdx)
+; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: lshr_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%eax), %ebp
+; FALLBACK16-NEXT:    movl 44(%eax), %ebx
+; FALLBACK16-NEXT:    movl 48(%eax), %edi
+; FALLBACK16-NEXT:    movl 52(%eax), %esi
+; FALLBACK16-NEXT:    movl 56(%eax), %edx
+; FALLBACK16-NEXT:    movl 60(%eax), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    andl $60, %esi
+; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT:    shll $3, %eax
+; FALLBACK16-NEXT:    andl $24, %eax
+; FALLBACK16-NEXT:    movl %edx, %edi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    movb %al, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %edi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK16-NEXT:    movl %edx, %ebp
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %eax, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %esi, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl %edi, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, (%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK17-NEXT:    movl 48(%ecx), %edi
+; FALLBACK17-NEXT:    movl 52(%ecx), %esi
+; FALLBACK17-NEXT:    movl 56(%ecx), %edx
+; FALLBACK17-NEXT:    movl 60(%ecx), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %esi, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    shrl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebp
+; FALLBACK18-NEXT:    movl 44(%eax), %ebx
+; FALLBACK18-NEXT:    movl 48(%eax), %edi
+; FALLBACK18-NEXT:    movl 52(%eax), %esi
+; FALLBACK18-NEXT:    movl 56(%eax), %edx
+; FALLBACK18-NEXT:    movl 60(%eax), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %eax
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    leal (,%eax,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ecx
+; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %ecx, %edi
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %edx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK18-NEXT:    movl %edi, 48(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $188, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK19-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK19-NEXT:    movl 48(%ecx), %edi
+; FALLBACK19-NEXT:    movl 52(%ecx), %esi
+; FALLBACK19-NEXT:    movl 56(%ecx), %edx
+; FALLBACK19-NEXT:    movl 60(%ecx), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %ecx
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, %ebp
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shll $3, %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl %edi, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK19-NEXT:    addl $188, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    andl $60, %esi
+; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %edi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %esi, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl %edi, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT:    movl (%eax), %ecx
+; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %esi, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    shrl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT:    movl (%eax), %ecx
+; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %ecx
+; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    addl %ebp, %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl %edx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK22-NEXT:    movl %edi, 48(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, (%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $188, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT:    movl (%eax), %ecx
+; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %ecx, %ebp
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shll $3, %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %esi
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl %edi, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK23-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, (%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK23-NEXT:    addl $188, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT:    movl (%eax), %ecx
+; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, %esi
+; FALLBACK24-NEXT:    andl $60, %esi
+; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT:    shll $3, %ecx
+; FALLBACK24-NEXT:    andl $24, %ecx
+; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 72(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK24-NEXT:    movl %ecx, %ebp
+; FALLBACK24-NEXT:    movb %cl, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %edi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %esi, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl %edi, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT:    movl (%eax), %ecx
+; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %esi, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    shrl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT:    movl (%eax), %ecx
+; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %ecx
+; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 120(%esp,%ecx), %ebp
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT:    movl 116(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT:    movl 124(%esp,%ecx), %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %edx
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    movl %edx, 60(%ecx)
+; FALLBACK26-NEXT:    movl %ebx, 56(%ecx)
+; FALLBACK26-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK26-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, (%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $188, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT:    movl (%eax), %ecx
+; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %ecx, %ebp
+; FALLBACK27-NEXT:    andl $60, %ebp
+; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shll $3, %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %esi
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl %edi, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK27-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, (%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK27-NEXT:    addl $188, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT:    movl (%eax), %ecx
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, %esi
+; FALLBACK28-NEXT:    andl $60, %esi
+; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT:    shll $3, %ecx
+; FALLBACK28-NEXT:    andl $24, %ecx
+; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 72(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK28-NEXT:    movl %ecx, %ebp
+; FALLBACK28-NEXT:    movb %cl, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %edi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %esi, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl %edi, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT:    movl (%eax), %ecx
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %esi, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    shrl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT:    movl (%eax), %edx
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    leal (,%edx,8), %ecx
+; FALLBACK30-NEXT:    andl $24, %ecx
+; FALLBACK30-NEXT:    andl $60, %edx
+; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK30-NEXT:    movl %ecx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 76(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 84(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 92(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 100(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 108(%esp,%edx), %esi
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 120(%esp,%edx), %ebp
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK30-NEXT:    movl 116(%esp,%edx), %eax
+; FALLBACK30-NEXT:    shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ecx, %ebp, %eax
+; FALLBACK30-NEXT:    movl 124(%esp,%edx), %edx
+; FALLBACK30-NEXT:    shrxl %ecx, %edx, %ebp
+; FALLBACK30-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %edx
+; FALLBACK30-NEXT:    orl %eax, %edx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT:    movl %edx, 56(%ecx)
+; FALLBACK30-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK30-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, (%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $188, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT:    movl (%eax), %ecx
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %ecx, %ebp
+; FALLBACK31-NEXT:    andl $60, %ebp
+; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shll $3, %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %esi
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl %edi, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK31-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, (%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK31-NEXT:    addl $188, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -2000,363 +12379,3775 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_64bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rbx
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %r9
-; X64-SSE2-NEXT:    movq 32(%rdi), %r10
-; X64-SSE2-NEXT:    movq 40(%rdi), %r11
-; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
-; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
-; X64-SSE2-NEXT:    movl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
-; X64-SSE2-NEXT:    negl %esi
-; X64-SSE2-NEXT:    movslq %esi, %rax
-; X64-SSE2-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rax), %r8
-; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %r9
-; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %r10
-; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %r11
-; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
-; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
-; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
-; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
-; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rcx, (%rdx)
-; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
-; X64-SSE2-NEXT:    popq %rbx
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: shl_64bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
-; X64-SSE42-NEXT:    movl (%rsi), %eax
-; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %eax
-; X64-SSE42-NEXT:    negl %eax
-; X64-SSE42-NEXT:    cltq
-; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm3
-; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX1-LABEL: shl_64bytes:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    movl (%rsi), %eax
-; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    andl $63, %eax
-; X64-AVX1-NEXT:    negl %eax
-; X64-AVX1-NEXT:    cltq
-; X64-AVX1-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT:    vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT:    vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX512-LABEL: shl_64bytes:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT:    movl (%rsi), %eax
-; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    andl $63, %eax
-; X64-AVX512-NEXT:    negl %eax
-; X64-AVX512-NEXT:    cltq
-; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT:    vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT:    vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X86-SSE2-LABEL: shl_64bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    subl %eax, %ecx
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%ecx), %ebp
-; X86-SSE2-NEXT:    movl 40(%ecx), %ebx
-; X86-SSE2-NEXT:    movl 52(%ecx), %edi
-; X86-SSE2-NEXT:    movl 60(%ecx), %esi
-; X86-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-SSE2-NEXT:    negl %eax
-; X86-SSE2-NEXT:    movl 152(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-SSE2-NEXT:    movl %esi, 60(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: shl_64bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $128, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, (%esp)
-; X86-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
-; X86-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    subl %ecx, %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    negl %ecx
-; X86-SSE42-NEXT:    movups 112(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX1-LABEL: shl_64bytes:
-; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    subl $128, %esp
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT:    movl (%ecx), %ecx
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm2, (%esp)
-; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    andl $63, %ecx
-; X86-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    subl %ecx, %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%edx), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%edx), %xmm2
-; X86-AVX1-NEXT:    negl %ecx
-; X86-AVX1-NEXT:    vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $128, %esp
-; X86-AVX1-NEXT:    vzeroupper
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX512-LABEL: shl_64bytes:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $128, %esp
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
-; X86-AVX512-NEXT:    movl (%ecx), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmovups %zmm1, (%esp)
-; X86-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT:    andl $63, %ecx
-; X86-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    subl %ecx, %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX512-NEXT:    vmovups 16(%edx), %xmm1
-; X86-AVX512-NEXT:    vmovups 32(%edx), %xmm2
-; X86-AVX512-NEXT:    negl %ecx
-; X86-AVX512-NEXT:    vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $128, %esp
-; X86-AVX512-NEXT:    vzeroupper
-; X86-AVX512-NEXT:    retl
+; FALLBACK0-LABEL: shl_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rax
+; FALLBACK0-NEXT:    movq 32(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 48(%rdi), %r8
+; FALLBACK0-NEXT:    movq (%rdi), %r9
+; FALLBACK0-NEXT:    movq 8(%rdi), %r10
+; FALLBACK0-NEXT:    movq 24(%rdi), %r11
+; FALLBACK0-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK0-NEXT:    movl (%rsi), %esi
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %esi
+; FALLBACK0-NEXT:    negl %esi
+; FALLBACK0-NEXT:    movslq %esi, %rbx
+; FALLBACK0-NEXT:    movq -56(%rsp,%rbx), %rdi
+; FALLBACK0-NEXT:    movq -40(%rsp,%rbx), %r8
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -64(%rsp,%rbx), %r10
+; FALLBACK0-NEXT:    movq -48(%rsp,%rbx), %r14
+; FALLBACK0-NEXT:    movq %r10, %r9
+; FALLBACK0-NEXT:    shrq %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    orq %r11, %r9
+; FALLBACK0-NEXT:    movq %r8, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    movq %r14, %r11
+; FALLBACK0-NEXT:    shrq %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    orq %r15, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    shrq %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r14, %rdi
+; FALLBACK0-NEXT:    movq -24(%rsp,%rbx), %r14
+; FALLBACK0-NEXT:    movq %r14, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r12
+; FALLBACK0-NEXT:    movq -32(%rsp,%rbx), %r13
+; FALLBACK0-NEXT:    movq %r13, %r15
+; FALLBACK0-NEXT:    shrq %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r13
+; FALLBACK0-NEXT:    shrq %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r8
+; FALLBACK0-NEXT:    orq %r13, %r8
+; FALLBACK0-NEXT:    movq -8(%rsp,%rbx), %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r12
+; FALLBACK0-NEXT:    movq -16(%rsp,%rbx), %rbx
+; FALLBACK0-NEXT:    movq %rbx, %r13
+; FALLBACK0-NEXT:    shrq %r13
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    orq %r12, %r13
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    shrq %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    orq %rbx, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    movq %r10, (%rdx)
+; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
+; FALLBACK0-NEXT:    movq %r13, 56(%rdx)
+; FALLBACK0-NEXT:    movq %r8, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: shl_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    pushq %rax
+; FALLBACK1-NEXT:    movq 24(%rdi), %rax
+; FALLBACK1-NEXT:    movq 40(%rdi), %rcx
+; FALLBACK1-NEXT:    movq 56(%rdi), %r8
+; FALLBACK1-NEXT:    movq (%rdi), %r9
+; FALLBACK1-NEXT:    movq 8(%rdi), %r10
+; FALLBACK1-NEXT:    movq 16(%rdi), %r11
+; FALLBACK1-NEXT:    movq 32(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %esi
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %esi
+; FALLBACK1-NEXT:    negl %esi
+; FALLBACK1-NEXT:    movslq %esi, %r9
+; FALLBACK1-NEXT:    movq -64(%rsp,%r9), %rax
+; FALLBACK1-NEXT:    movq -48(%rsp,%r9), %rsi
+; FALLBACK1-NEXT:    movq -56(%rsp,%r9), %r10
+; FALLBACK1-NEXT:    movq -40(%rsp,%r9), %r11
+; FALLBACK1-NEXT:    movq %r10, %rdi
+; FALLBACK1-NEXT:    shldq %cl, %rax, %rdi
+; FALLBACK1-NEXT:    movq %r11, %r8
+; FALLBACK1-NEXT:    shldq %cl, %rsi, %r8
+; FALLBACK1-NEXT:    shldq %cl, %r10, %rsi
+; FALLBACK1-NEXT:    movq -32(%rsp,%r9), %r10
+; FALLBACK1-NEXT:    movq -24(%rsp,%r9), %rbx
+; FALLBACK1-NEXT:    movq %rbx, %r14
+; FALLBACK1-NEXT:    shldq %cl, %r10, %r14
+; FALLBACK1-NEXT:    shldq %cl, %r11, %r10
+; FALLBACK1-NEXT:    movq -16(%rsp,%r9), %r11
+; FALLBACK1-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK1-NEXT:    shldq %cl, %r11, %r9
+; FALLBACK1-NEXT:    shldq %cl, %rbx, %r11
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shlq %cl, %rax
+; FALLBACK1-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK1-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rax, (%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK1-NEXT:    addq $8, %rsp
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: shl_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq 16(%rdi), %rax
+; FALLBACK2-NEXT:    movq 32(%rdi), %rcx
+; FALLBACK2-NEXT:    movq 48(%rdi), %r8
+; FALLBACK2-NEXT:    movq (%rdi), %r9
+; FALLBACK2-NEXT:    movq 8(%rdi), %r10
+; FALLBACK2-NEXT:    movq 24(%rdi), %r11
+; FALLBACK2-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %esi
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    andl $56, %esi
+; FALLBACK2-NEXT:    negl %esi
+; FALLBACK2-NEXT:    movslq %esi, %rdi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rdi), %rsi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rdi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r9
+; FALLBACK2-NEXT:    movq -64(%rsp,%rdi), %r14
+; FALLBACK2-NEXT:    movq -48(%rsp,%rdi), %r10
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rbx
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r11
+; FALLBACK2-NEXT:    movq -24(%rsp,%rdi), %r8
+; FALLBACK2-NEXT:    shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT:    shlxq %rax, %r14, %r12
+; FALLBACK2-NEXT:    movl %eax, %r13d
+; FALLBACK2-NEXT:    notb %r13b
+; FALLBACK2-NEXT:    shrq %r14
+; FALLBACK2-NEXT:    shrxq %r13, %r14, %r14
+; FALLBACK2-NEXT:    orq %r9, %r14
+; FALLBACK2-NEXT:    movq -32(%rsp,%rdi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT:    shrq %r10
+; FALLBACK2-NEXT:    shrxq %r13, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    shlxq %rax, -8(%rsp,%rdi), %rbx
+; FALLBACK2-NEXT:    movq -16(%rsp,%rdi), %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rax
+; FALLBACK2-NEXT:    shrq %rsi
+; FALLBACK2-NEXT:    shrxq %r13, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    shrq %r9
+; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    shrq %rcx
+; FALLBACK2-NEXT:    shrxq %r13, %rcx, %rcx
+; FALLBACK2-NEXT:    orq %rbp, %rcx
+; FALLBACK2-NEXT:    shrq %rdi
+; FALLBACK2-NEXT:    shrxq %r13, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %rbx, %rdi
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT:    orq %rax, %r8
+; FALLBACK2-NEXT:    movq %r12, (%rdx)
+; FALLBACK2-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rcx, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r14, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: shl_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    pushq %rax
+; FALLBACK3-NEXT:    movq 24(%rdi), %rax
+; FALLBACK3-NEXT:    movq 40(%rdi), %rcx
+; FALLBACK3-NEXT:    movq 56(%rdi), %r8
+; FALLBACK3-NEXT:    movq (%rdi), %r9
+; FALLBACK3-NEXT:    movq 8(%rdi), %r10
+; FALLBACK3-NEXT:    movq 16(%rdi), %r11
+; FALLBACK3-NEXT:    movq 32(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %esi
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %esi
+; FALLBACK3-NEXT:    negl %esi
+; FALLBACK3-NEXT:    movslq %esi, %r8
+; FALLBACK3-NEXT:    movq -64(%rsp,%r8), %rdi
+; FALLBACK3-NEXT:    movq -48(%rsp,%r8), %rax
+; FALLBACK3-NEXT:    movq -56(%rsp,%r8), %r9
+; FALLBACK3-NEXT:    movq -40(%rsp,%r8), %r10
+; FALLBACK3-NEXT:    movq %r9, %rsi
+; FALLBACK3-NEXT:    shldq %cl, %rdi, %rsi
+; FALLBACK3-NEXT:    movq %r10, %r11
+; FALLBACK3-NEXT:    shldq %cl, %rax, %r11
+; FALLBACK3-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK3-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK3-NEXT:    movq -24(%rsp,%r8), %rbx
+; FALLBACK3-NEXT:    movq %rbx, %r14
+; FALLBACK3-NEXT:    shldq %cl, %r9, %r14
+; FALLBACK3-NEXT:    shldq %cl, %r10, %r9
+; FALLBACK3-NEXT:    movq -16(%rsp,%r8), %r10
+; FALLBACK3-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK3-NEXT:    shldq %cl, %r10, %r8
+; FALLBACK3-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK3-NEXT:    shlxq %rcx, %rdi, %rcx
+; FALLBACK3-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK3-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK3-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rcx, (%rdx)
+; FALLBACK3-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK3-NEXT:    addq $8, %rsp
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: shl_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT:    movl (%rsi), %ecx
+; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %ecx
+; FALLBACK4-NEXT:    negl %ecx
+; FALLBACK4-NEXT:    movslq %ecx, %r9
+; FALLBACK4-NEXT:    movq -32(%rsp,%r9), %rdi
+; FALLBACK4-NEXT:    movq %rdi, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK4-NEXT:    movq %rbx, %r8
+; FALLBACK4-NEXT:    shrq %r8
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    orq %r10, %r8
+; FALLBACK4-NEXT:    movq -24(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r11
+; FALLBACK4-NEXT:    shrq %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r11, %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK4-NEXT:    movq %r15, %r11
+; FALLBACK4-NEXT:    shrq %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    orq %rbx, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK4-NEXT:    movq %r12, %rbx
+; FALLBACK4-NEXT:    shrq %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r15, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    movq %r14, %r15
+; FALLBACK4-NEXT:    shrq %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r15
+; FALLBACK4-NEXT:    orq %r12, %r15
+; FALLBACK4-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK4-NEXT:    movq %r12, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r13
+; FALLBACK4-NEXT:    shrq %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    orq %r13, %r10
+; FALLBACK4-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    shrq %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    orq %r9, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    movq %r14, (%rdx)
+; FALLBACK4-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK4-NEXT:    movq %r8, 32(%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: shl_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    negl %eax
+; FALLBACK5-NEXT:    movslq %eax, %r8
+; FALLBACK5-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK5-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK5-NEXT:    movq %r9, %rsi
+; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK5-NEXT:    movq %r10, %rdi
+; FALLBACK5-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK5-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK5-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK5-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK5-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK5-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK5-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK5-NEXT:    movq %r14, %r15
+; FALLBACK5-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK5-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK5-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK5-NEXT:    movq %r11, %r10
+; FALLBACK5-NEXT:    shlq %cl, %r10
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK5-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK5-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK5-NEXT:    movq %r10, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: shl_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    pushq %rax
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT:    movl (%rsi), %ecx
+; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    negl %ecx
+; FALLBACK6-NEXT:    movslq %ecx, %rsi
+; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %r12
+; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r13
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rbx
+; FALLBACK6-NEXT:    shlxq %rax, %rbx, %r11
+; FALLBACK6-NEXT:    movq -56(%rsp,%rsi), %r15
+; FALLBACK6-NEXT:    shlxq %rax, %r15, %r14
+; FALLBACK6-NEXT:    movl %eax, %r10d
+; FALLBACK6-NEXT:    notb %r10b
+; FALLBACK6-NEXT:    shrq %rdi
+; FALLBACK6-NEXT:    shrxq %r10, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r12, %rdi
+; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %r12
+; FALLBACK6-NEXT:    shlxq %rax, %r12, %rbp
+; FALLBACK6-NEXT:    shrq %r8
+; FALLBACK6-NEXT:    shrxq %r10, %r8, %r8
+; FALLBACK6-NEXT:    orq %r13, %r8
+; FALLBACK6-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK6-NEXT:    movq -64(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT:    shlxq %rax, %rsi, %rax
+; FALLBACK6-NEXT:    shrq %rbx
+; FALLBACK6-NEXT:    shrxq %r10, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r9, %rbx
+; FALLBACK6-NEXT:    shrq %r15
+; FALLBACK6-NEXT:    shrxq %r10, %r15, %r9
+; FALLBACK6-NEXT:    orq %r11, %r9
+; FALLBACK6-NEXT:    shrq %rsi
+; FALLBACK6-NEXT:    shrxq %r10, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %r14, %rsi
+; FALLBACK6-NEXT:    shrq %rcx
+; FALLBACK6-NEXT:    shrxq %r10, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %rbp, %rcx
+; FALLBACK6-NEXT:    shrq %r12
+; FALLBACK6-NEXT:    shrxq %r10, %r12, %r10
+; FALLBACK6-NEXT:    orq %r13, %r10
+; FALLBACK6-NEXT:    movq %rax, (%rdx)
+; FALLBACK6-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: shl_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    negl %eax
+; FALLBACK7-NEXT:    movslq %eax, %r8
+; FALLBACK7-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK7-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK7-NEXT:    movq %r9, %rsi
+; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK7-NEXT:    movq %r10, %rdi
+; FALLBACK7-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK7-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK7-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK7-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK7-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK7-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK7-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK7-NEXT:    movq %r14, %r15
+; FALLBACK7-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK7-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK7-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK7-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK7-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK7-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK7-NEXT:    movq %r10, (%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: shl_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT:    movl (%rsi), %ecx
+; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %ecx
+; FALLBACK8-NEXT:    negl %ecx
+; FALLBACK8-NEXT:    movslq %ecx, %r9
+; FALLBACK8-NEXT:    movq -32(%rsp,%r9), %rdi
+; FALLBACK8-NEXT:    movq %rdi, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK8-NEXT:    movq %rbx, %r8
+; FALLBACK8-NEXT:    shrq %r8
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    orq %r10, %r8
+; FALLBACK8-NEXT:    movq -24(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r11
+; FALLBACK8-NEXT:    shrq %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r11, %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK8-NEXT:    movq %r15, %r11
+; FALLBACK8-NEXT:    shrq %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    orq %rbx, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    movq %r12, %rbx
+; FALLBACK8-NEXT:    shrq %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r15, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    movq %r14, %r15
+; FALLBACK8-NEXT:    shrq %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r15
+; FALLBACK8-NEXT:    orq %r12, %r15
+; FALLBACK8-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    movq %r12, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r13
+; FALLBACK8-NEXT:    shrq %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    orq %r13, %r10
+; FALLBACK8-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    shrq %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    orq %r9, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    movq %r14, (%rdx)
+; FALLBACK8-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK8-NEXT:    movq %r8, 32(%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: shl_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    negl %eax
+; FALLBACK9-NEXT:    movslq %eax, %r8
+; FALLBACK9-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK9-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK9-NEXT:    movq %r9, %rsi
+; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK9-NEXT:    movq %r10, %rdi
+; FALLBACK9-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK9-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK9-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK9-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK9-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK9-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK9-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK9-NEXT:    movq %r14, %r15
+; FALLBACK9-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK9-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK9-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK9-NEXT:    movq %r11, %r10
+; FALLBACK9-NEXT:    shlq %cl, %r10
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK9-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK9-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %r10, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: shl_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    pushq %rax
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT:    movl (%rsi), %ecx
+; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    negl %ecx
+; FALLBACK10-NEXT:    movslq %ecx, %rsi
+; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %r12
+; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r13
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rbx
+; FALLBACK10-NEXT:    shlxq %rax, %rbx, %r11
+; FALLBACK10-NEXT:    movq -56(%rsp,%rsi), %r15
+; FALLBACK10-NEXT:    shlxq %rax, %r15, %r14
+; FALLBACK10-NEXT:    movl %eax, %r10d
+; FALLBACK10-NEXT:    notb %r10b
+; FALLBACK10-NEXT:    shrq %rdi
+; FALLBACK10-NEXT:    shrxq %r10, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r12, %rdi
+; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %r12
+; FALLBACK10-NEXT:    shlxq %rax, %r12, %rbp
+; FALLBACK10-NEXT:    shrq %r8
+; FALLBACK10-NEXT:    shrxq %r10, %r8, %r8
+; FALLBACK10-NEXT:    orq %r13, %r8
+; FALLBACK10-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK10-NEXT:    movq -64(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    shlxq %rax, %rsi, %rax
+; FALLBACK10-NEXT:    shrq %rbx
+; FALLBACK10-NEXT:    shrxq %r10, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r9, %rbx
+; FALLBACK10-NEXT:    shrq %r15
+; FALLBACK10-NEXT:    shrxq %r10, %r15, %r9
+; FALLBACK10-NEXT:    orq %r11, %r9
+; FALLBACK10-NEXT:    shrq %rsi
+; FALLBACK10-NEXT:    shrxq %r10, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %r14, %rsi
+; FALLBACK10-NEXT:    shrq %rcx
+; FALLBACK10-NEXT:    shrxq %r10, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %rbp, %rcx
+; FALLBACK10-NEXT:    shrq %r12
+; FALLBACK10-NEXT:    shrxq %r10, %r12, %r10
+; FALLBACK10-NEXT:    orq %r13, %r10
+; FALLBACK10-NEXT:    movq %rax, (%rdx)
+; FALLBACK10-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: shl_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    negl %eax
+; FALLBACK11-NEXT:    movslq %eax, %r8
+; FALLBACK11-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK11-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK11-NEXT:    movq %r9, %rsi
+; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK11-NEXT:    movq %r10, %rdi
+; FALLBACK11-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK11-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK11-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK11-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK11-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK11-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK11-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK11-NEXT:    movq %r14, %r15
+; FALLBACK11-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK11-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK11-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK11-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK11-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK11-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK11-NEXT:    movq %r10, (%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: shl_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT:    movl (%rsi), %ecx
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %ecx
+; FALLBACK12-NEXT:    negl %ecx
+; FALLBACK12-NEXT:    movslq %ecx, %r9
+; FALLBACK12-NEXT:    movq -32(%rsp,%r9), %rdi
+; FALLBACK12-NEXT:    movq %rdi, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK12-NEXT:    movq %rbx, %r8
+; FALLBACK12-NEXT:    shrq %r8
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    orq %r10, %r8
+; FALLBACK12-NEXT:    movq -24(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r11
+; FALLBACK12-NEXT:    shrq %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r11, %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK12-NEXT:    movq %r15, %r11
+; FALLBACK12-NEXT:    shrq %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    orq %rbx, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    movq %r12, %rbx
+; FALLBACK12-NEXT:    shrq %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r15, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    movq %r14, %r15
+; FALLBACK12-NEXT:    shrq %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r15
+; FALLBACK12-NEXT:    orq %r12, %r15
+; FALLBACK12-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    movq %r12, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r13
+; FALLBACK12-NEXT:    shrq %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    orq %r13, %r10
+; FALLBACK12-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    shrq %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    orq %r9, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    movq %r14, (%rdx)
+; FALLBACK12-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK12-NEXT:    movq %r8, 32(%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: shl_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT:    movl (%rsi), %eax
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %eax
+; FALLBACK13-NEXT:    negl %eax
+; FALLBACK13-NEXT:    movslq %eax, %r8
+; FALLBACK13-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK13-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK13-NEXT:    movq %r9, %rsi
+; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK13-NEXT:    movq %r10, %rdi
+; FALLBACK13-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK13-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK13-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK13-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK13-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK13-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK13-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK13-NEXT:    movq %r14, %r15
+; FALLBACK13-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK13-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK13-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK13-NEXT:    movq %r11, %r10
+; FALLBACK13-NEXT:    shlq %cl, %r10
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK13-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK13-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT:    movq %r10, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: shl_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    pushq %rax
+; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT:    movl (%rsi), %ecx
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    andl $56, %eax
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    negl %ecx
+; FALLBACK14-NEXT:    movslq %ecx, %rsi
+; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %r12
+; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r13
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rbx
+; FALLBACK14-NEXT:    shlxq %rax, %rbx, %r11
+; FALLBACK14-NEXT:    movq -56(%rsp,%rsi), %r15
+; FALLBACK14-NEXT:    shlxq %rax, %r15, %r14
+; FALLBACK14-NEXT:    movl %eax, %r10d
+; FALLBACK14-NEXT:    notb %r10b
+; FALLBACK14-NEXT:    shrq %rdi
+; FALLBACK14-NEXT:    shrxq %r10, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r12, %rdi
+; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %r12
+; FALLBACK14-NEXT:    shlxq %rax, %r12, %rbp
+; FALLBACK14-NEXT:    shrq %r8
+; FALLBACK14-NEXT:    shrxq %r10, %r8, %r8
+; FALLBACK14-NEXT:    orq %r13, %r8
+; FALLBACK14-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r13
+; FALLBACK14-NEXT:    movq -64(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shlxq %rax, %rsi, %rax
+; FALLBACK14-NEXT:    shrq %rbx
+; FALLBACK14-NEXT:    shrxq %r10, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r9, %rbx
+; FALLBACK14-NEXT:    shrq %r15
+; FALLBACK14-NEXT:    shrxq %r10, %r15, %r9
+; FALLBACK14-NEXT:    orq %r11, %r9
+; FALLBACK14-NEXT:    shrq %rsi
+; FALLBACK14-NEXT:    shrxq %r10, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r14, %rsi
+; FALLBACK14-NEXT:    shrq %rcx
+; FALLBACK14-NEXT:    shrxq %r10, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %rbp, %rcx
+; FALLBACK14-NEXT:    shrq %r12
+; FALLBACK14-NEXT:    shrxq %r10, %r12, %r10
+; FALLBACK14-NEXT:    orq %r13, %r10
+; FALLBACK14-NEXT:    movq %rax, (%rdx)
+; FALLBACK14-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: shl_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    negl %eax
+; FALLBACK15-NEXT:    movslq %eax, %r8
+; FALLBACK15-NEXT:    movq -40(%rsp,%r8), %rax
+; FALLBACK15-NEXT:    movq -32(%rsp,%r8), %r9
+; FALLBACK15-NEXT:    movq %r9, %rsi
+; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK15-NEXT:    movq %r10, %rdi
+; FALLBACK15-NEXT:    shldq %cl, %r9, %rdi
+; FALLBACK15-NEXT:    movq -48(%rsp,%r8), %r9
+; FALLBACK15-NEXT:    shldq %cl, %r9, %rax
+; FALLBACK15-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK15-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK15-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK15-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK15-NEXT:    movq %r14, %r15
+; FALLBACK15-NEXT:    shldq %cl, %r10, %r15
+; FALLBACK15-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK15-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK15-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK15-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK15-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %r10, (%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: shl_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%eax), %ebp
+; FALLBACK16-NEXT:    movl 44(%eax), %ebx
+; FALLBACK16-NEXT:    movl 48(%eax), %edi
+; FALLBACK16-NEXT:    movl 52(%eax), %esi
+; FALLBACK16-NEXT:    movl 56(%eax), %edx
+; FALLBACK16-NEXT:    movl 60(%eax), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    andl $60, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    subl %edx, %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ecx), %edx
+; FALLBACK16-NEXT:    movl %ecx, %ebp
+; FALLBACK16-NEXT:    shll $3, %eax
+; FALLBACK16-NEXT:    andl $24, %eax
+; FALLBACK16-NEXT:    movl %edx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %al, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 8(%ebp), %esi
+; FALLBACK16-NEXT:    movl %ebp, %edi
+; FALLBACK16-NEXT:    movl %esi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    movl 20(%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 16(%edi), %esi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 24(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%edx), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 32(%edx), %esi
+; FALLBACK16-NEXT:    movl %edx, %ebp
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 40(%ebp), %esi
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 52(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    negl %edx
+; FALLBACK16-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%edi), %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 56(%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 56(%eax)
+; FALLBACK16-NEXT:    movl %edi, 60(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK17-NEXT:    movl 48(%ecx), %edi
+; FALLBACK17-NEXT:    movl 52(%ecx), %esi
+; FALLBACK17-NEXT:    movl 56(%ecx), %edx
+; FALLBACK17-NEXT:    movl 60(%ecx), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    subl %ebp, %eax
+; FALLBACK17-NEXT:    movl 8(%eax), %esi
+; FALLBACK17-NEXT:    movl 12(%eax), %edx
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    movl %edx, %edi
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%eax), %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%eax), %edi
+; FALLBACK17-NEXT:    movl 20(%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%eax), %edi
+; FALLBACK17-NEXT:    movl 28(%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%eax), %edi
+; FALLBACK17-NEXT:    movl 36(%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%eax), %edx
+; FALLBACK17-NEXT:    movl 44(%eax), %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 56(%eax), %edx
+; FALLBACK17-NEXT:    movl 60(%eax), %edi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl (%eax), %ebx
+; FALLBACK17-NEXT:    movl 52(%eax), %esi
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    negl %ebp
+; FALLBACK17-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT:    shll %cl, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK17-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebx
+; FALLBACK18-NEXT:    movl 44(%eax), %edi
+; FALLBACK18-NEXT:    movl 48(%eax), %esi
+; FALLBACK18-NEXT:    movl 52(%eax), %edx
+; FALLBACK18-NEXT:    movl 56(%eax), %ecx
+; FALLBACK18-NEXT:    movl 60(%eax), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK18-NEXT:    movl (%ebp), %ebp
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    leal (,%ebp,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ebp
+; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT:    subl %ebp, %edi
+; FALLBACK18-NEXT:    movl (%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %ecx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 12(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    movl 20(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 28(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %eax, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    movl 36(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 44(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %eax, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 52(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %eax, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    negl %eax
+; FALLBACK18-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK18-NEXT:    movl 56(%edi), %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %edx, %esi
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, (%eax)
+; FALLBACK18-NEXT:    movl %esi, 56(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $204, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl (%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%ebp), %ebx
+; FALLBACK19-NEXT:    movl 44(%ebp), %edi
+; FALLBACK19-NEXT:    movl 48(%ebp), %esi
+; FALLBACK19-NEXT:    movl 52(%ebp), %edx
+; FALLBACK19-NEXT:    movl 56(%ebp), %ecx
+; FALLBACK19-NEXT:    movl 60(%ebp), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl (%ebp), %ebp
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    leal (,%ebp,8), %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    subl %ebp, %eax
+; FALLBACK19-NEXT:    movl 4(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%eax), %edi
+; FALLBACK19-NEXT:    movl 12(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%eax), %edi
+; FALLBACK19-NEXT:    movl 20(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%eax), %edi
+; FALLBACK19-NEXT:    movl 28(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%eax), %edi
+; FALLBACK19-NEXT:    movl 36(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%eax), %ebx
+; FALLBACK19-NEXT:    movl 44(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT:    movl 56(%eax), %edx
+; FALLBACK19-NEXT:    movl 60(%eax), %edi
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl (%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 52(%eax), %esi
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK19-NEXT:    negl %ebp
+; FALLBACK19-NEXT:    movl 176(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl %edx, 56(%eax)
+; FALLBACK19-NEXT:    movl %edi, 60(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK19-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK19-NEXT:    movl %esi, 52(%eax)
+; FALLBACK19-NEXT:    movl %ebx, 40(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK19-NEXT:    movl %edi, 4(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, (%eax)
+; FALLBACK19-NEXT:    addl $204, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    andl $60, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    subl %edx, %ecx
+; FALLBACK20-NEXT:    movl (%ecx), %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 4(%ecx), %edx
+; FALLBACK20-NEXT:    movl %ecx, %ebp
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 8(%ebp), %esi
+; FALLBACK20-NEXT:    movl %ebp, %edi
+; FALLBACK20-NEXT:    movl %esi, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl 20(%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 16(%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %ebp, %edx
+; FALLBACK20-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 24(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %esi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 36(%edx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 32(%edx), %esi
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 40(%ebp), %esi
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    negl %edx
+; FALLBACK20-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    movl 60(%edi), %edx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 56(%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %edx, (%eax)
+; FALLBACK20-NEXT:    movl %esi, 56(%eax)
+; FALLBACK20-NEXT:    movl %edi, 60(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT:    movl (%eax), %ecx
+; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    subl %ebp, %eax
+; FALLBACK21-NEXT:    movl 8(%eax), %esi
+; FALLBACK21-NEXT:    movl 12(%eax), %edx
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    movl %edx, %edi
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 4(%eax), %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 16(%eax), %edi
+; FALLBACK21-NEXT:    movl 20(%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 24(%eax), %edi
+; FALLBACK21-NEXT:    movl 28(%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 32(%eax), %edi
+; FALLBACK21-NEXT:    movl 36(%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 40(%eax), %edx
+; FALLBACK21-NEXT:    movl 44(%eax), %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%eax), %edx
+; FALLBACK21-NEXT:    movl 60(%eax), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl (%eax), %ebx
+; FALLBACK21-NEXT:    movl 52(%eax), %esi
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    negl %ebp
+; FALLBACK21-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT:    shll %cl, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK21-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT:    movl (%eax), %eax
+; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    leal (,%eax,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK22-NEXT:    subl %eax, %edi
+; FALLBACK22-NEXT:    movl (%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 4(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 8(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %ecx
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 12(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 16(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    movl 20(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 24(%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 28(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %eax, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 32(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    movl 36(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 40(%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 44(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %eax, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 52(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %eax, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    negl %eax
+; FALLBACK22-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK22-NEXT:    movl 56(%edi), %eax
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %edx, %esi
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %edx, (%eax)
+; FALLBACK22-NEXT:    movl %esi, 56(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $204, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT:    movl (%eax), %ebp
+; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    leal (,%ebp,8), %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    subl %ebp, %eax
+; FALLBACK23-NEXT:    movl 4(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 8(%eax), %edi
+; FALLBACK23-NEXT:    movl 12(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 16(%eax), %edi
+; FALLBACK23-NEXT:    movl 20(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 24(%eax), %edi
+; FALLBACK23-NEXT:    movl 28(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 32(%eax), %edi
+; FALLBACK23-NEXT:    movl 36(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%eax), %ebx
+; FALLBACK23-NEXT:    movl 44(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT:    movl 56(%eax), %edx
+; FALLBACK23-NEXT:    movl 60(%eax), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl (%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 52(%eax), %esi
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK23-NEXT:    negl %ebp
+; FALLBACK23-NEXT:    movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl %edx, 56(%eax)
+; FALLBACK23-NEXT:    movl %edi, 60(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK23-NEXT:    movl %esi, 52(%eax)
+; FALLBACK23-NEXT:    movl %ebx, 40(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK23-NEXT:    movl %edi, 4(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, (%eax)
+; FALLBACK23-NEXT:    addl $204, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT:    movl (%eax), %eax
+; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    andl $60, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    subl %edx, %ecx
+; FALLBACK24-NEXT:    movl (%ecx), %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 4(%ecx), %edx
+; FALLBACK24-NEXT:    movl %ecx, %ebp
+; FALLBACK24-NEXT:    shll $3, %eax
+; FALLBACK24-NEXT:    andl $24, %eax
+; FALLBACK24-NEXT:    movl %edx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 8(%ebp), %esi
+; FALLBACK24-NEXT:    movl %ebp, %edi
+; FALLBACK24-NEXT:    movl %esi, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl 20(%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 16(%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %ebp, %edx
+; FALLBACK24-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 24(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %esi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 36(%edx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 32(%edx), %esi
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 40(%ebp), %esi
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    negl %edx
+; FALLBACK24-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    movl 60(%edi), %edx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 56(%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %edx, (%eax)
+; FALLBACK24-NEXT:    movl %esi, 56(%eax)
+; FALLBACK24-NEXT:    movl %edi, 60(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT:    movl (%eax), %ecx
+; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    subl %ebp, %eax
+; FALLBACK25-NEXT:    movl 8(%eax), %esi
+; FALLBACK25-NEXT:    movl 12(%eax), %edx
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    movl %edx, %edi
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 4(%eax), %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 16(%eax), %edi
+; FALLBACK25-NEXT:    movl 20(%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 24(%eax), %edi
+; FALLBACK25-NEXT:    movl 28(%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 32(%eax), %edi
+; FALLBACK25-NEXT:    movl 36(%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 40(%eax), %edx
+; FALLBACK25-NEXT:    movl 44(%eax), %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%eax), %edx
+; FALLBACK25-NEXT:    movl 60(%eax), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl (%eax), %ebx
+; FALLBACK25-NEXT:    movl 52(%eax), %esi
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    negl %ebp
+; FALLBACK25-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT:    shll %cl, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK25-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT:    movl (%eax), %eax
+; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    leal (,%eax,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK26-NEXT:    subl %eax, %edi
+; FALLBACK26-NEXT:    movl (%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 4(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 8(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %ecx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 12(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 16(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    movl 20(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 24(%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 28(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 32(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    movl 36(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 40(%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 44(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 52(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %eax, %ebp
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    negl %eax
+; FALLBACK26-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK26-NEXT:    movl 56(%edi), %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %edx, %esi
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %edx, (%eax)
+; FALLBACK26-NEXT:    movl %esi, 56(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK26-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $204, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT:    movl (%eax), %ebx
+; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    leal (,%ebx,8), %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    andl $60, %ebx
+; FALLBACK27-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    subl %ebx, %eax
+; FALLBACK27-NEXT:    movl 4(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 8(%eax), %edi
+; FALLBACK27-NEXT:    movl 12(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 16(%eax), %edi
+; FALLBACK27-NEXT:    movl 20(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 24(%eax), %edi
+; FALLBACK27-NEXT:    movl 28(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 32(%eax), %edi
+; FALLBACK27-NEXT:    movl 36(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%eax), %ebp
+; FALLBACK27-NEXT:    movl 44(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %ebp, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT:    movl 56(%eax), %edx
+; FALLBACK27-NEXT:    movl 60(%eax), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl (%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 52(%eax), %esi
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK27-NEXT:    negl %ebx
+; FALLBACK27-NEXT:    movl 176(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl %edx, 56(%eax)
+; FALLBACK27-NEXT:    movl %edi, 60(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    shldl %cl, %ebx, %esi
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT:    movl %ebx, 48(%eax)
+; FALLBACK27-NEXT:    movl %esi, 52(%eax)
+; FALLBACK27-NEXT:    movl %ebp, 40(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK27-NEXT:    movl %edi, 4(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, (%eax)
+; FALLBACK27-NEXT:    addl $204, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    andl $60, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    subl %edx, %ecx
+; FALLBACK28-NEXT:    movl (%ecx), %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 4(%ecx), %edx
+; FALLBACK28-NEXT:    movl %ecx, %ebp
+; FALLBACK28-NEXT:    shll $3, %eax
+; FALLBACK28-NEXT:    andl $24, %eax
+; FALLBACK28-NEXT:    movl %edx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 8(%ebp), %esi
+; FALLBACK28-NEXT:    movl %ebp, %edi
+; FALLBACK28-NEXT:    movl %esi, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl 20(%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 16(%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %ebp, %edx
+; FALLBACK28-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 24(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %esi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 36(%edx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 32(%edx), %esi
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 40(%ebp), %esi
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    negl %edx
+; FALLBACK28-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    movl 60(%edi), %edx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 56(%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %edx, (%eax)
+; FALLBACK28-NEXT:    movl %esi, 56(%eax)
+; FALLBACK28-NEXT:    movl %edi, 60(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT:    movl (%eax), %ecx
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    subl %ebp, %eax
+; FALLBACK29-NEXT:    movl 8(%eax), %esi
+; FALLBACK29-NEXT:    movl 12(%eax), %edx
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    movl %edx, %edi
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 4(%eax), %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 16(%eax), %edi
+; FALLBACK29-NEXT:    movl 20(%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 24(%eax), %edi
+; FALLBACK29-NEXT:    movl 28(%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 32(%eax), %edi
+; FALLBACK29-NEXT:    movl 36(%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 40(%eax), %edx
+; FALLBACK29-NEXT:    movl 44(%eax), %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%eax), %edx
+; FALLBACK29-NEXT:    movl 60(%eax), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl (%eax), %ebx
+; FALLBACK29-NEXT:    movl 52(%eax), %esi
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    negl %ebp
+; FALLBACK29-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT:    shll %cl, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK29-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT:    movl (%eax), %eax
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    leal (,%eax,8), %edx
+; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    andl $60, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK30-NEXT:    subl %eax, %edi
+; FALLBACK30-NEXT:    movl (%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 4(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 8(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %ecx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 12(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 16(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    movl 20(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 28(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 32(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    movl 36(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 40(%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 44(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 52(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %eax, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    negl %eax
+; FALLBACK30-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK30-NEXT:    movl 56(%edi), %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %edx, %esi
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %edx, (%eax)
+; FALLBACK30-NEXT:    movl %esi, 56(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK30-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $204, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT:    movl (%eax), %ebx
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    leal (,%ebx,8), %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    andl $60, %ebx
+; FALLBACK31-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    subl %ebx, %eax
+; FALLBACK31-NEXT:    movl 4(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 8(%eax), %edi
+; FALLBACK31-NEXT:    movl 12(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 16(%eax), %edi
+; FALLBACK31-NEXT:    movl 20(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 24(%eax), %edi
+; FALLBACK31-NEXT:    movl 28(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 32(%eax), %edi
+; FALLBACK31-NEXT:    movl 36(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%eax), %ebp
+; FALLBACK31-NEXT:    movl 44(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %ebp, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT:    movl 56(%eax), %edx
+; FALLBACK31-NEXT:    movl 60(%eax), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl (%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 52(%eax), %esi
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK31-NEXT:    negl %ebx
+; FALLBACK31-NEXT:    movl 176(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl %edx, 56(%eax)
+; FALLBACK31-NEXT:    movl %edi, 60(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    shldl %cl, %ebx, %esi
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT:    movl %ebx, 48(%eax)
+; FALLBACK31-NEXT:    movl %esi, 52(%eax)
+; FALLBACK31-NEXT:    movl %ebp, 40(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK31-NEXT:    movl %edi, 4(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, (%eax)
+; FALLBACK31-NEXT:    addl $204, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -2365,370 +16156,4089 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_64bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rbx
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %r9
-; X64-SSE2-NEXT:    movq 32(%rdi), %r10
-; X64-SSE2-NEXT:    movq 40(%rdi), %r11
-; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
-; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
-; X64-SSE2-NEXT:    movl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    sarq $63, %rdi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
-; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
-; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
-; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
-; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT:    popq %rbx
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE42-LABEL: ashr_64bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
-; X64-SSE42-NEXT:    movq 48(%rdi), %rax
-; X64-SSE42-NEXT:    movq 56(%rdi), %rcx
-; X64-SSE42-NEXT:    movl (%rsi), %esi
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    sarq $63, %rcx
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %esi
-; X64-SSE42-NEXT:    movups -128(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rsi), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rsi), %xmm3
-; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
-;
-; X64-AVX-LABEL: ashr_64bytes:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT:    movq 48(%rdi), %rax
-; X64-AVX-NEXT:    movq 56(%rdi), %rcx
-; X64-AVX-NEXT:    movl (%rsi), %esi
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    sarq $63, %rcx
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $63, %esi
-; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi), %xmm3
-; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    vzeroupper
-; X64-AVX-NEXT:    retq
-;
-; X86-SSE2-LABEL: ashr_64bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
-; X86-SSE2-NEXT:    movl %edx, 60(%eax)
-; X86-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: ashr_64bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $128, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movl 48(%edx), %esi
-; X86-SSE42-NEXT:    movl 52(%edx), %edi
-; X86-SSE42-NEXT:    movl 56(%edx), %ebx
-; X86-SSE42-NEXT:    movl 60(%edx), %edx
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
-; X86-SSE42-NEXT:    retl
-;
-; X86-AVX-LABEL: ashr_64bytes:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $128, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    vmovups 32(%edx), %xmm1
-; X86-AVX-NEXT:    movl 48(%edx), %esi
-; X86-AVX-NEXT:    movl 52(%edx), %edi
-; X86-AVX-NEXT:    movl 56(%edx), %ebx
-; X86-AVX-NEXT:    movl 60(%edx), %edx
-; X86-AVX-NEXT:    movl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $63, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm3
-; X86-AVX-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $128, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
-; X86-AVX-NEXT:    vzeroupper
-; X86-AVX-NEXT:    retl
+; FALLBACK0-LABEL: ashr_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq 16(%rdi), %rax
+; FALLBACK0-NEXT:    movq 32(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 48(%rdi), %r8
+; FALLBACK0-NEXT:    movq (%rdi), %r9
+; FALLBACK0-NEXT:    movq 8(%rdi), %r10
+; FALLBACK0-NEXT:    movq 24(%rdi), %r11
+; FALLBACK0-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %r14
+; FALLBACK0-NEXT:    movl (%rsi), %edi
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    sarq $63, %r14
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %edi
+; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK0-NEXT:    movq %r8, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    leaq (%r14,%r14), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %r11, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r8, %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %rbx, %r8
+; FALLBACK0-NEXT:    movq %r9, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    orq %r15, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    addq %r9, %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r14, %r9
+; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    movq %r14, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r12
+; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r14, %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    orq %rbx, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r13, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    sarq %cl, %rdi
+; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 48(%rdx)
+; FALLBACK0-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: ashr_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    pushq %rax
+; FALLBACK1-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK1-NEXT:    movq 40(%rdi), %r8
+; FALLBACK1-NEXT:    movq 56(%rdi), %r9
+; FALLBACK1-NEXT:    movq (%rdi), %r10
+; FALLBACK1-NEXT:    movq 8(%rdi), %r11
+; FALLBACK1-NEXT:    movq 16(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 32(%rdi), %r14
+; FALLBACK1-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %eax
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    sarq $63, %r9
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %eax
+; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq %r9, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq %r10, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %rdi
+; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK1-NEXT:    movq %rbx, %r14
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %r14
+; FALLBACK1-NEXT:    shrdq %cl, %rbx, %r9
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    sarq %cl, %rax
+; FALLBACK1-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK1-NEXT:    addq $8, %rsp
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: ashr_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK2-NEXT:    movq 32(%rdi), %r8
+; FALLBACK2-NEXT:    movq 48(%rdi), %r9
+; FALLBACK2-NEXT:    movq (%rdi), %r10
+; FALLBACK2-NEXT:    movq 8(%rdi), %r11
+; FALLBACK2-NEXT:    movq 24(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 40(%rdi), %r14
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %eax
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    sarq $63, %rdi
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %rbx
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbp
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r13
+; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    notb %r12b
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    addq %r8, %r8
+; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT:    orq %rbp, %r8
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %rbp
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    addq %r14, %r14
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT:    orq %r13, %r11
+; FALLBACK2-NEXT:    addq %rax, %rax
+; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT:    orq %rbp, %rax
+; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: ashr_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    pushq %rax
+; FALLBACK3-NEXT:    movq 24(%rdi), %rcx
+; FALLBACK3-NEXT:    movq 40(%rdi), %r8
+; FALLBACK3-NEXT:    movq 56(%rdi), %r9
+; FALLBACK3-NEXT:    movq (%rdi), %r10
+; FALLBACK3-NEXT:    movq 8(%rdi), %r11
+; FALLBACK3-NEXT:    movq 16(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 32(%rdi), %r14
+; FALLBACK3-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %eax
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    sarq $63, %r9
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %eax
+; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq %r9, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq %r10, %r11
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %r11
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %rdi
+; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK3-NEXT:    movq %rbx, %r14
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %r14
+; FALLBACK3-NEXT:    shrdq %cl, %rbx, %r9
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK3-NEXT:    addq $8, %rsp
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: ashr_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbp
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    pushq %rax
+; FALLBACK4-NEXT:    movq 56(%rdi), %rax
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK4-NEXT:    movl (%rsi), %edi
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    sarq $63, %rax
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %edi
+; FALLBACK4-NEXT:    movq -128(%rsp,%rdi), %rbx
+; FALLBACK4-NEXT:    movq -112(%rsp,%rdi), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -120(%rsp,%rdi), %r11
+; FALLBACK4-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK4-NEXT:    leaq (%r11,%r11), %r9
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    orq %rbx, %r9
+; FALLBACK4-NEXT:    movq %r10, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r14
+; FALLBACK4-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK4-NEXT:    leaq (%r12,%r12), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r14, %rbx
+; FALLBACK4-NEXT:    movq %r8, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r14
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r14, %r10
+; FALLBACK4-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK4-NEXT:    movq %r14, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r13
+; FALLBACK4-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    orq %r13, %r15
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    addq %r14, %r14
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    orq %r12, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbp
+; FALLBACK4-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK4-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    orq %rbp, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    addq %r8, %r8
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    orq %r11, %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    sarq %cl, %rdi
+; FALLBACK4-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK4-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r9, (%rdx)
+; FALLBACK4-NEXT:    addq $8, %rsp
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    popq %rbp
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: ashr_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    sarq $63, %rdi
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq %r10, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT:    movq %r11, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK5-NEXT:    movq %r14, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %r14, %r9
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    sarq %cl, %rax
+; FALLBACK5-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK5-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK5-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: ashr_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    pushq %rax
+; FALLBACK6-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    sarq $63, %rcx
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT:    shrxq %rcx, %rdi, %r12
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rsi
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT:    shrxq %rcx, %rsi, %r9
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    shrxq %rcx, %r10, %r14
+; FALLBACK6-NEXT:    shrxq %rcx, %r13, %r15
+; FALLBACK6-NEXT:    movl %ecx, %ebx
+; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT:    orq %r11, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT:    shrxq %rcx, %r12, %r13
+; FALLBACK6-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r9, %rdi
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    orq %r14, %r9
+; FALLBACK6-NEXT:    addq %r10, %r10
+; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    orq %r15, %r10
+; FALLBACK6-NEXT:    addq %rax, %rax
+; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    orq %r13, %rax
+; FALLBACK6-NEXT:    addq %rsi, %rsi
+; FALLBACK6-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %rbp, %rsi
+; FALLBACK6-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r8, (%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: ashr_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    sarq $63, %rdi
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK7-NEXT:    movq %r8, %r11
+; FALLBACK7-NEXT:    shrdq %cl, %r10, %r11
+; FALLBACK7-NEXT:    movq %r9, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r8, %rbx
+; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK7-NEXT:    movq %r14, %r15
+; FALLBACK7-NEXT:    shrdq %cl, %r8, %r15
+; FALLBACK7-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %rdi
+; FALLBACK7-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK7-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK7-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK7-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK7-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, (%rdx)
+; FALLBACK7-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: ashr_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbp
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    pushq %rax
+; FALLBACK8-NEXT:    movq 56(%rdi), %rax
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK8-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK8-NEXT:    movl (%rsi), %edi
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    sarq $63, %rax
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %edi
+; FALLBACK8-NEXT:    movq -128(%rsp,%rdi), %rbx
+; FALLBACK8-NEXT:    movq -112(%rsp,%rdi), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -120(%rsp,%rdi), %r11
+; FALLBACK8-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK8-NEXT:    leaq (%r11,%r11), %r9
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    orq %rbx, %r9
+; FALLBACK8-NEXT:    movq %r10, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r14
+; FALLBACK8-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK8-NEXT:    leaq (%r12,%r12), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r14, %rbx
+; FALLBACK8-NEXT:    movq %r8, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r14
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r14, %r10
+; FALLBACK8-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK8-NEXT:    movq %r14, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r13
+; FALLBACK8-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    orq %r13, %r15
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    addq %r14, %r14
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    orq %r12, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbp
+; FALLBACK8-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK8-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    orq %rbp, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    addq %r8, %r8
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    orq %r11, %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    sarq %cl, %rdi
+; FALLBACK8-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK8-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r9, (%rdx)
+; FALLBACK8-NEXT:    addq $8, %rsp
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    popq %rbp
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: ashr_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK9-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    sarq $63, %rdi
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq %r10, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT:    movq %r11, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK9-NEXT:    movq %r14, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %r14, %r9
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    sarq %cl, %rax
+; FALLBACK9-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK9-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK9-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: ashr_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    pushq %rax
+; FALLBACK10-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK10-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    sarq $63, %rcx
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT:    shrxq %rcx, %rdi, %r12
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rsi
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT:    shrxq %rcx, %rsi, %r9
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    shrxq %rcx, %r10, %r14
+; FALLBACK10-NEXT:    shrxq %rcx, %r13, %r15
+; FALLBACK10-NEXT:    movl %ecx, %ebx
+; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT:    orq %r11, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT:    shrxq %rcx, %r12, %r13
+; FALLBACK10-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r9, %rdi
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    orq %r14, %r9
+; FALLBACK10-NEXT:    addq %r10, %r10
+; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    orq %r15, %r10
+; FALLBACK10-NEXT:    addq %rax, %rax
+; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    orq %r13, %rax
+; FALLBACK10-NEXT:    addq %rsi, %rsi
+; FALLBACK10-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %rbp, %rsi
+; FALLBACK10-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r8, (%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: ashr_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK11-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    sarq $63, %rdi
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK11-NEXT:    movq %r8, %r11
+; FALLBACK11-NEXT:    shrdq %cl, %r10, %r11
+; FALLBACK11-NEXT:    movq %r9, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r8, %rbx
+; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK11-NEXT:    movq %r14, %r15
+; FALLBACK11-NEXT:    shrdq %cl, %r8, %r15
+; FALLBACK11-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %rdi
+; FALLBACK11-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK11-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK11-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK11-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK11-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, (%rdx)
+; FALLBACK11-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: ashr_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbp
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    pushq %rax
+; FALLBACK12-NEXT:    movq 56(%rdi), %rax
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK12-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK12-NEXT:    movl (%rsi), %edi
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    sarq $63, %rax
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %edi
+; FALLBACK12-NEXT:    movq -128(%rsp,%rdi), %rbx
+; FALLBACK12-NEXT:    movq -112(%rsp,%rdi), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -120(%rsp,%rdi), %r11
+; FALLBACK12-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK12-NEXT:    leaq (%r11,%r11), %r9
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    orq %rbx, %r9
+; FALLBACK12-NEXT:    movq %r10, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r14
+; FALLBACK12-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK12-NEXT:    leaq (%r12,%r12), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r14, %rbx
+; FALLBACK12-NEXT:    movq %r8, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r14
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r14, %r10
+; FALLBACK12-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK12-NEXT:    movq %r14, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r13
+; FALLBACK12-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    orq %r13, %r15
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    addq %r14, %r14
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    orq %r12, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbp
+; FALLBACK12-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK12-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    orq %rbp, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    addq %r8, %r8
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    orq %r11, %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    sarq %cl, %rdi
+; FALLBACK12-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK12-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r9, (%rdx)
+; FALLBACK12-NEXT:    addq $8, %rsp
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    popq %rbp
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: ashr_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK13-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK13-NEXT:    movl (%rsi), %eax
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    sarq $63, %rdi
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %eax
+; FALLBACK13-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq -104(%rsp,%rax), %r10
+; FALLBACK13-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -112(%rsp,%rax), %r11
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK13-NEXT:    movq -96(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq %r10, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT:    movq %r11, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK13-NEXT:    movq -80(%rsp,%rax), %r10
+; FALLBACK13-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK13-NEXT:    movq %r14, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %r14, %r9
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    sarq %cl, %rax
+; FALLBACK13-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK13-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK13-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: ashr_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    pushq %rax
+; FALLBACK14-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK14-NEXT:    movq 48(%rdi), %rdi
+; FALLBACK14-NEXT:    movl (%rsi), %eax
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    sarq $63, %rcx
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    andl $56, %eax
+; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rsi
+; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %r9
+; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT:    movl %ecx, %ebx
+; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT:    orq %r11, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK14-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r9, %rdi
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    orq %r14, %r9
+; FALLBACK14-NEXT:    addq %r10, %r10
+; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    orq %r15, %r10
+; FALLBACK14-NEXT:    addq %rax, %rax
+; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    orq %r13, %rax
+; FALLBACK14-NEXT:    addq %rsi, %rsi
+; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %rbp, %rsi
+; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r8, (%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: ashr_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK15-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    sarq $63, %rdi
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    movq %r8, %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r10, %r11
+; FALLBACK15-NEXT:    movq %r9, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r8, %rbx
+; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK15-NEXT:    movq %r14, %r15
+; FALLBACK15-NEXT:    shrdq %cl, %r8, %r15
+; FALLBACK15-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %rdi
+; FALLBACK15-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK15-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK15-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK15-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK15-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, (%rdx)
+; FALLBACK15-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: ashr_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%ecx), %ebx
+; FALLBACK16-NEXT:    movl 44(%ecx), %edi
+; FALLBACK16-NEXT:    movl 48(%ecx), %esi
+; FALLBACK16-NEXT:    movl 52(%ecx), %edx
+; FALLBACK16-NEXT:    movl 56(%ecx), %eax
+; FALLBACK16-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %ebp
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    sarl $31, %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, %ecx
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    andl $60, %esi
+; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT:    shll $3, %ecx
+; FALLBACK16-NEXT:    andl $24, %ecx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 72(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    movl %ecx, %ebx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%esi), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl 84(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 100(%esp,%edx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %edx, %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 120(%esp,%edx), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    sarl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %esi, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl %edi, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, (%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: ashr_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl (%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%eax), %ebp
+; FALLBACK17-NEXT:    movl 44(%eax), %ebx
+; FALLBACK17-NEXT:    movl 48(%eax), %edi
+; FALLBACK17-NEXT:    movl 52(%eax), %esi
+; FALLBACK17-NEXT:    movl 56(%eax), %edx
+; FALLBACK17-NEXT:    movl 60(%eax), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    sarl $31, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %esi, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    sarl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: ashr_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebp
+; FALLBACK18-NEXT:    movl 44(%eax), %ebx
+; FALLBACK18-NEXT:    movl 48(%eax), %edi
+; FALLBACK18-NEXT:    movl 52(%eax), %esi
+; FALLBACK18-NEXT:    movl 56(%eax), %edx
+; FALLBACK18-NEXT:    movl 60(%eax), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %eax
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    sarl $31, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    leal (,%eax,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ecx
+; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %ecx, %edi
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %edx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK18-NEXT:    movl %edi, 48(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: ashr_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $188, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl (%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%eax), %ebp
+; FALLBACK19-NEXT:    movl 44(%eax), %ebx
+; FALLBACK19-NEXT:    movl 48(%eax), %edi
+; FALLBACK19-NEXT:    movl 52(%eax), %esi
+; FALLBACK19-NEXT:    movl 56(%eax), %edx
+; FALLBACK19-NEXT:    movl 60(%eax), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %ecx
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    sarl $31, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, %ebp
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shll $3, %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl %edi, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK19-NEXT:    addl $188, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: ashr_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movl 48(%ecx), %edx
+; FALLBACK20-NEXT:    movl 52(%ecx), %esi
+; FALLBACK20-NEXT:    movl 56(%ecx), %edi
+; FALLBACK20-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    sarl $31, %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    andl $60, %esi
+; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %edi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    sarl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %esi, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl %edi, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: ashr_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movups (%eax), %xmm0
+; FALLBACK21-NEXT:    movups 16(%eax), %xmm1
+; FALLBACK21-NEXT:    movups 32(%eax), %xmm2
+; FALLBACK21-NEXT:    movl 48(%eax), %edx
+; FALLBACK21-NEXT:    movl 52(%eax), %esi
+; FALLBACK21-NEXT:    movl 56(%eax), %edi
+; FALLBACK21-NEXT:    movl 60(%eax), %eax
+; FALLBACK21-NEXT:    movl (%ecx), %ecx
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    sarl $31, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %esi, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    sarl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: ashr_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movl 48(%ecx), %edx
+; FALLBACK22-NEXT:    movl 52(%ecx), %esi
+; FALLBACK22-NEXT:    movl 56(%ecx), %edi
+; FALLBACK22-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK22-NEXT:    movl (%eax), %eax
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    sarl $31, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    leal (,%eax,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %ecx
+; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl %ecx, %edi
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    addl %ebp, %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl %edx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK22-NEXT:    movl %edi, 48(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, (%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: ashr_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $188, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movups (%eax), %xmm0
+; FALLBACK23-NEXT:    movups 16(%eax), %xmm1
+; FALLBACK23-NEXT:    movups 32(%eax), %xmm2
+; FALLBACK23-NEXT:    movl 48(%eax), %edx
+; FALLBACK23-NEXT:    movl 52(%eax), %esi
+; FALLBACK23-NEXT:    movl 56(%eax), %edi
+; FALLBACK23-NEXT:    movl 60(%eax), %eax
+; FALLBACK23-NEXT:    movl (%ecx), %ecx
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    sarl $31, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %ecx, %ebp
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shll $3, %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %esi
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl %edi, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK23-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, (%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK23-NEXT:    addl $188, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: ashr_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK24-NEXT:    movl 48(%ecx), %edx
+; FALLBACK24-NEXT:    movl 52(%ecx), %esi
+; FALLBACK24-NEXT:    movl 56(%ecx), %edi
+; FALLBACK24-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK24-NEXT:    movl (%eax), %eax
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    sarl $31, %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    andl $60, %esi
+; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT:    shll $3, %eax
+; FALLBACK24-NEXT:    andl $24, %eax
+; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %edi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    sarl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %esi, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl %edi, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: ashr_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK25-NEXT:    movl 48(%eax), %edx
+; FALLBACK25-NEXT:    movl 52(%eax), %esi
+; FALLBACK25-NEXT:    movl 56(%eax), %edi
+; FALLBACK25-NEXT:    movl 60(%eax), %eax
+; FALLBACK25-NEXT:    movl (%ecx), %ecx
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    sarl $31, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %esi, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    sarl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: ashr_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK26-NEXT:    movl 48(%ecx), %edx
+; FALLBACK26-NEXT:    movl 52(%ecx), %esi
+; FALLBACK26-NEXT:    movl 56(%ecx), %edi
+; FALLBACK26-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK26-NEXT:    movl (%eax), %eax
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    sarl $31, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    leal (,%eax,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %ecx
+; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %ecx, %edi
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %ecx
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK26-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK26-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK26-NEXT:    addl %ebp, %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl %edx, 60(%eax)
+; FALLBACK26-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK26-NEXT:    movl %edi, 48(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK26-NEXT:    movl %esi, 40(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, (%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: ashr_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $188, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK27-NEXT:    movl 48(%eax), %edx
+; FALLBACK27-NEXT:    movl 52(%eax), %esi
+; FALLBACK27-NEXT:    movl 56(%eax), %edi
+; FALLBACK27-NEXT:    movl 60(%eax), %eax
+; FALLBACK27-NEXT:    movl (%ecx), %ecx
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    sarl $31, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %ecx, %ebp
+; FALLBACK27-NEXT:    andl $60, %ebp
+; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shll $3, %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %esi
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl %edi, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK27-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, (%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK27-NEXT:    addl $188, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: ashr_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK28-NEXT:    movl 48(%ecx), %edx
+; FALLBACK28-NEXT:    movl 52(%ecx), %esi
+; FALLBACK28-NEXT:    movl 56(%ecx), %edi
+; FALLBACK28-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    sarl $31, %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    andl $60, %esi
+; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT:    shll $3, %eax
+; FALLBACK28-NEXT:    andl $24, %eax
+; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %edi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    sarl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %esi, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl %edi, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: ashr_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK29-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK29-NEXT:    movl 48(%eax), %edx
+; FALLBACK29-NEXT:    movl 52(%eax), %esi
+; FALLBACK29-NEXT:    movl 56(%eax), %edi
+; FALLBACK29-NEXT:    movl 60(%eax), %eax
+; FALLBACK29-NEXT:    movl (%ecx), %ecx
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    sarl $31, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %esi, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    sarl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: ashr_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK30-NEXT:    movl 48(%ecx), %edx
+; FALLBACK30-NEXT:    movl 52(%ecx), %esi
+; FALLBACK30-NEXT:    movl 56(%ecx), %edi
+; FALLBACK30-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK30-NEXT:    movl (%eax), %eax
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    sarl $31, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    leal (,%eax,8), %edx
+; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    andl $60, %ecx
+; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl %ecx, %edi
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %ecx
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK30-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK30-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK30-NEXT:    addl %ebp, %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK30-NEXT:    orl %eax, %ebx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl %edx, 60(%eax)
+; FALLBACK30-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK30-NEXT:    movl %edi, 48(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK30-NEXT:    movl %esi, 40(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, (%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: ashr_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $188, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK31-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK31-NEXT:    movl 48(%eax), %edx
+; FALLBACK31-NEXT:    movl 52(%eax), %esi
+; FALLBACK31-NEXT:    movl 56(%eax), %edi
+; FALLBACK31-NEXT:    movl 60(%eax), %eax
+; FALLBACK31-NEXT:    movl (%ecx), %ecx
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    sarl $31, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %ecx, %ebp
+; FALLBACK31-NEXT:    andl $60, %ebp
+; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shll $3, %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %esi
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl %edi, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK31-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, (%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK31-NEXT:    addl $188, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -2738,37 +20248,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK14: {{.*}}
-; FALLBACK15: {{.*}}
-; FALLBACK16: {{.*}}
-; FALLBACK17: {{.*}}
-; FALLBACK18: {{.*}}
-; FALLBACK19: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK20: {{.*}}
-; FALLBACK21: {{.*}}
-; FALLBACK22: {{.*}}
-; FALLBACK23: {{.*}}
-; FALLBACK24: {{.*}}
-; FALLBACK25: {{.*}}
-; FALLBACK26: {{.*}}
-; FALLBACK27: {{.*}}
-; FALLBACK28: {{.*}}
-; FALLBACK29: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK30: {{.*}}
-; FALLBACK31: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
 ; X64: {{.*}}
+; X64-AVX: {{.*}}
+; X64-AVX1: {{.*}}
+; X64-AVX512: {{.*}}
+; X64-SSE2: {{.*}}
+; X64-SSE42: {{.*}}
 ; X86: {{.*}}
+; X86-AVX: {{.*}}
+; X86-AVX1: {{.*}}
+; X86-AVX512: {{.*}}
+; X86-SSE2: {{.*}}
+; X86-SSE42: {{.*}}
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 5c9c81758d633..4e33deb825500 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,22 +588,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
@@ -641,7 +639,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -654,41 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -701,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -758,42 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -884,64 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -950,47 +940,48 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -999,30 +990,28 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %al, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
@@ -1044,7 +1033,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1053,47 +1042,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1180,61 +1171,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1247,42 +1238,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1295,52 +1286,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1353,43 +1344,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1406,44 +1397,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9,8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9,8), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
@@ -1459,110 +1449,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rdi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rdi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -1572,107 +1559,99 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi,4), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi,4), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax,4), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
@@ -1682,8 +1661,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
@@ -1691,7 +1670,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1704,77 +1683,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax,4), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax,4), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1787,100 +1762,99 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 32(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1893,78 +1867,74 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax,4), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1980,121 +1950,119 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r10), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r10), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r10), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r10), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %cl, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
@@ -2102,9 +2070,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
@@ -2116,36 +2084,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2154,63 +2122,57 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $84, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
@@ -2219,52 +2181,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2273,7 +2235,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $84, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2286,78 +2248,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%ebx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2370,99 +2334,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 84(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 92(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2473,7 +2427,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2486,80 +2440,80 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $108, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%ebx), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2575,45 +2529,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9,8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9,8), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
@@ -2629,113 +2583,113 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rsi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rdi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rdi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rcx, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -2745,118 +2699,115 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%edx), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx,4), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx,4), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2865,7 +2816,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2878,78 +2829,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax,4), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax,4), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2962,101 +2914,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 32(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3069,79 +3025,80 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $80, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax,4), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $80, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3157,181 +3114,171 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3342,83 +3289,80 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -3429,68 +3373,65 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3499,42 +3440,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3543,6 +3488,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3555,61 +3501,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3618,40 +3549,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
@@ -3660,11 +3590,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -3675,42 +3605,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -3723,19 +3653,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
@@ -3759,7 +3689,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3772,7 +3702,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3802,6 +3732,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -3816,6 +3750,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -3828,22 +3763,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
@@ -3929,7 +3848,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3942,7 +3861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3963,24 +3882,28 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3988,6 +3911,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4000,129 +3924,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 56(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
@@ -4130,7 +4038,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
@@ -4150,7 +4058,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $184, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4163,7 +4071,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4193,6 +4101,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -4207,6 +4119,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4219,22 +4132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
@@ -4318,7 +4215,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4338,65 +4235,62 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rbx), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
@@ -4409,10 +4303,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%rbx), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
@@ -4429,11 +4323,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 32(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
@@ -4449,64 +4343,64 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r9), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r9), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r9), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r9), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r9), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
@@ -4519,83 +4413,82 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -4608,63 +4501,63 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r8), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r8), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r8), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r8), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r8), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r8), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rdi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
@@ -4675,7 +4568,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4697,22 +4590,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4721,6 +4614,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4733,74 +4629,83 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl %ecx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
@@ -4808,18 +4713,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
@@ -4827,92 +4731,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
@@ -4936,7 +4820,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4949,7 +4833,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4979,6 +4863,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -4993,6 +4879,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5005,108 +4894,92 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %ebp, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 160(%esp,%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 160(%esp,%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5119,36 +4992,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -5163,6 +5038,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5180,131 +5058,119 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
@@ -5330,7 +5196,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5343,7 +5209,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5364,7 +5230,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ebx), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ebx), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ebx), %esi
@@ -5373,13 +5239,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5387,6 +5255,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5405,22 +5276,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
@@ -5455,7 +5310,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
@@ -5465,7 +5320,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 164(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 176(%esp,%ebx), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
@@ -5481,7 +5336,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
@@ -5502,7 +5357,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $180, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5517,183 +5372,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 48(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -5704,28 +5555,29 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
@@ -5734,54 +5586,53 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -5792,69 +5643,69 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -5863,7 +5714,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6124,7 +5975,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6137,7 +5988,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6295,7 +6146,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -6308,7 +6159,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $188, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6331,7 +6182,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
@@ -6343,7 +6194,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -6387,8 +6238,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
@@ -6403,15 +6254,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 60(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6421,12 +6272,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6436,11 +6287,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6449,26 +6300,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6477,11 +6328,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -6489,7 +6340,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
@@ -6503,7 +6354,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
@@ -6523,7 +6374,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $188, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6536,7 +6387,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6692,7 +6543,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $176, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 9ae1f270e8833..08d0eef07951c 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -432,30 +432,92 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -505,30 +567,92 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -577,30 +701,92 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -649,32 +835,134 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $36, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $36, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -689,58 +977,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -756,58 +1109,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -822,58 +1253,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movl -64(%rsp,%rax), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -888,60 +1397,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movl %ecx, %eax
+; X64-SHLD-NEXT:    shrb $6, %al
+; X64-SHLD-NEXT:    movzbl %al, %eax
+; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -956,70 +1596,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-NEXT:    movq -56(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, 8(%rdx)
-; X64-NEXT:    movq %rcx, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $92, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %ebx
+; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -1034,84 +1892,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movzbl -128(%rsp,%rsi), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movb %cl, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1127,84 +2056,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movw %si, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1219,84 +2219,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movl -128(%rsp,%rsi), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movl %esi, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1311,86 +2382,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    pushq %rax
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    andl $56, %esi
+; X64-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-SHLD-NEXT:    popq %rax
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $128, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %esi
+; X86-SHLD-NEXT:    andl $60, %esi
+; X86-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-SHLD-NEXT:    movl (%esp,%esi), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    movl %esi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $128, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1405,96 +2606,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $156, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edi
+; X86-SHLD-NEXT:    andl $60, %edi
+; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl 28(%esp,%edi), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1509,116 +2940,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-NEXT:    movq -112(%rsp,%rsi), %rdi
-; X64-NEXT:    movq -104(%rsp,%rsi), %rsi
-; X64-NEXT:    movq %rsi, 24(%rdx)
-; X64-NEXT:    movq %rdi, 16(%rdx)
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $136, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
-; X86-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %eax
-; X86-NEXT:    movl 8(%esp,%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl 16(%esp,%eax), %esi
-; X86-NEXT:    movl 20(%esp,%eax), %edi
-; X86-NEXT:    movl 24(%esp,%eax), %ebx
-; X86-NEXT:    movl 28(%esp,%eax), %ebp
-; X86-NEXT:    movl 32(%esp,%eax), %edx
-; X86-NEXT:    movl 36(%esp,%eax), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebp, 20(%eax)
-; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $136, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $156, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edi
+; X86-SHLD-NEXT:    andl $60, %edi
+; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
+; X86-SHLD-NEXT:    movl 20(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    movl %eax, %esi
+; X86-SHLD-NEXT:    movl %edx, %eax
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 36(%esp,%edi), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-SHLD-NEXT:    movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    movl 48(%esp,%edi), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SHLD-NEXT:    movl %eax, 28(%edi)
+; X86-SHLD-NEXT:    movl %edx, 24(%edi)
+; X86-SHLD-NEXT:    movl %esi, 20(%edi)
+; X86-SHLD-NEXT:    movl %ebp, 16(%edi)
+; X86-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 12(%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 8(%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-SHLD-NEXT:    movl %ebx, (%edi)
+; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1633,9 +3432,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
-; X64-NO-BMI2-HAVE-SHLD: {{.*}}
+; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
+; X86: {{.*}}
 ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 4a47e7613dfa6..aeb7b233f853d 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,32 +603,92 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -711,32 +771,92 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -818,32 +938,92 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -925,34 +1105,137 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $36, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SHLD-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $36, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -967,64 +1250,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; no @load_16byte_chunk_of_16byte_alloca
 
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1038,64 +1385,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1108,64 +1532,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movl -64(%rsp,%rax), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1178,66 +1679,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movl %ecx, %eax
+; X64-SHLD-NEXT:    shrb $6, %al
+; X64-SHLD-NEXT:    movzbl %al, %eax
+; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1250,76 +1882,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-NEXT:    movq -56(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, 8(%rdx)
-; X64-NEXT:    movq %rcx, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $92, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %ebx
+; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1334,7 +2185,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; no @load_32byte_chunk_of_32byte_alloca
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
+; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86: {{.*}}
 ; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}


